Example #1
    def content_based_calculator(self, stringofwords):
        # split the query on non-word characters and lowercase each term
        divider = re.compile('\\W*')
        res=[x.lower() for x in divider.split(stringofwords) if x!= '']
        content_scores={}
        paperid_index={}
        content_out={}
        index=1
        for word in res:
            if word in self.wordlocations:
                for papid in self.wordlocations[word]:
                    # score a paper by how often this word occurs in it
                    papid_score=len(self.wordlocations[word][papid])
                    if papid not in content_scores:
                        content_scores[papid]=papid_score
                        paperid_index[papid]=index
                    else:
                        # multiply per-word scores; count matched query words
                        content_scores[papid]=content_scores[papid]*papid_score
                        paperid_index[papid]+=1
            else:
                continue

        # keep only papers that matched every query word
        for pid in paperid_index:
            if paperid_index[pid]==len(res):
                content_out[pid]=content_scores[pid]

        inst=searchengine.searcher('database')
        content_out=inst.normalizescores(content_out)
        self.contentscore= content_out
Example #2
def generateFig(filePath):
    e=searchengine.searcher('searchindex.db')
    frequencies= e.getFrequentWords()
    # take relative word frequencies into account, lower max_font_size
    #wordcloud = WordCloud(max_font_size=40, relative_scaling=.5).generate(text)
    # set.add() returns None, so add the stopword first and pass the set itself
    STOPWORDS.add(u"黄豆")
    wordcloud = WordCloud(font_path='/home/jamin/Documents/resource/msyh.ttf',
                          background_color="white", stopwords=STOPWORDS,
                          max_font_size=40, relative_scaling=.25).fit_words(frequencies)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.savefig(filePath)
Example #3
def makeindex(key):
    e = s.searcher('searchindex.db')
    result = e.query(key)
    List = []
    size = len(result)
    for i in range(size):
        for j in result[i]:
            List.append(e.geturlname(j))
    return List
Example #4
def pageRank():
  reload(searchengine)
  crawler=searchengine.crawler('searchindex.db')
  e=searchengine.searcher('searchindex.db')
  #crawler.calculatepagerank( )
  cur=crawler.con.execute('select * from pagerank order by score desc')
  for i in range(3):
    d=cur.next()
    print d,e.geturlname(d[0])
Example #5
def makeindex(key):
  e = s.searcher('searchindex.db')
  result = e.query(key)
  List = []
  size = len(result)
  for i in range(size):
    for j in result[i]:
      List.append(e.geturlname(j))
  return List
Example #6
def queryhandler():
    e = searchengine.searcher()
    q = bottle.request.forms.get("query")
    mywords, myurls = e.query(q)
    s = bottle.request.environ.get('beaker.session')
    s['mywords'] = mywords
    s['myurls'] = myurls
    s.save()
    bottle.redirect('/results')
Example #7
def test_calculate_pagerank():
    sys.stderr.write("testing pagerank calculation...\n")
    crawler=searchengine.crawler('searchindex.db')
    crawler.calculatepagerank()
    sys.stderr.write("checking pagerank result...\n")
    cur=crawler.con.execute('select * from pagerank order by score desc')
    # keep the fetched rows so the top url can actually be checked below
    rows=[cur.next() for i in range(3)]
    for row in rows: print row
    sys.stderr.write("checking pagerank top url...\n")
    e=searchengine.searcher('searchindex.db')
    urlid=rows[0][0]
    print e.geturlname(urlid)
Example #8
    def testQueryIndian(self):
        wordids=[]
        rows=[]
        if self.config.queries is None or len(self.config.queries) == 0:
            queries=['memory', 'mental', 'mind', 'storage', 'magnetic', 'cache', 'psychological', 'semiconductor', 'transistor', 'random access', 'data storage']
        else:
            queries = self.config.queries
        s=searcher(self.dbname)
        if self.numusers >= 1:
            for q in queries:
                for userid in [x+1 for x in range(self.numusers)]:
                    wordids,rows = s.query(q, userid)
Example #9
def firesearch():
    outputwidget.delete(1.0,END)
    fillconfig()
    s=searcher(config.dbname)
    q=queryvar.get()
    urllist=[]
    try:
        widlist,urlidlist = s.query(q,config.userid,config.userurlhitscoresweight)
        for urlid in urlidlist:
            url=s.geturlname(urlid)
            urllist.append(url)
        outputwidget.insert(END, '\n'.join(urllist))
    except:
        print "Error:", sys.exc_info()
        tkMessageBox.showerror("Input Error", sys.exc_info())
        raise
Example #10
def serve_search(environ, start_response):

  query_words = ''
  results = ''
  if 'QUERY_STRING' in environ:
    query_dict = cgi.parse_qs(environ['QUERY_STRING'])
    if 'q' in query_dict:
      # parse_qs returns a list for values as query parameters can appear
      # several times (e.g. 'q=ddsview&q=makeicns'). Ignore all but the first
      # occurrence of q.
      query_words = query_dict['q'][0]
      s = searchengine.searcher('searchindex.db')
      results = '<br>\n'.join(['%f: <a href="%s">%s</a>' % (score, url, url)
        for score, url in s.query(query_words)])
      results = results.encode('utf-8')

  # Note: this also returns html for favicon queries.
  start_response('200 OK',[('Content-type','text/html')])
  return [template % locals()]
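
The parse_qs comment above is easy to verify: every key maps to a list with one entry per occurrence, which is why the handler takes query_dict['q'][0]. A short Python 2 check (on Python 3 the same function lives in urllib.parse):

import cgi
print cgi.parse_qs('q=ddsview&q=makeicns')
# prints {'q': ['ddsview', 'makeicns']}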
Example #11
    def pagerank_calculator(self, iterations=20):
        pageranks={}
        for item in self.citations:
            pageranks.setdefault(item,1.0)
        for i in range(iterations):
            #print 'Iteration %d' % i
            pr=0.15  # minimum PageRank contribution (damping factor 0.85)
            for item in pageranks:
                init_score=0
                for element in self.citations[item]:
                    if element not in pageranks:
                        val=1.0
                    else:
                        val=pageranks[element]
                    linknum=self.citationcounts[element]
                    init_score+=float(val)/linknum
                pageranks[item]=pr+(0.85*init_score)
        inst=searchengine.searcher('database')
        pageranks=inst.normalizescores(pageranks)
        #print pageranks['9402117']
        self.pagerankscore=pageranks
Example #12
def serve_search(environ, start_response):

    query_words = ''
    results = ''
    if 'QUERY_STRING' in environ:
        query_dict = cgi.parse_qs(environ['QUERY_STRING'])
        if 'q' in query_dict:
            # parse_qs returns a list for values as query parameters can appear
            # several times (e.g. 'q=ddsview&q=makeicns'). Ignore all but the first
            # occurrence of q.
            query_words = query_dict['q'][0]
            s = searchengine.searcher('searchindex.db')
            results = '<br>\n'.join([
                '%f: <a href="%s">%s</a>' % (score, url, url)
                for score, url in s.query(query_words)
            ])
            results = results.encode('utf-8')

    # Note: this also returns html for favicon queries.
    start_response('200 OK', [('Content-type', 'text/html')])
    return [template % locals()]
Example #13
def generatePosNegFile(filepath):
    f=open(filepath,'w')
    e=searchengine.searcher('searchindex.db')
    cursor= e.con.execute(
            " select * from urllist where posnegscore is not null order by posnegscore desc limit 3  " )
    for row in cursor:
        f.write(row[0])
        f.write("\t")
        f.write(str(row[1]))
        f.write("\t")
        f.write("pos")
        f.write("\n")
    cursor= e.con.execute(
            " select * from urllist where posnegscore is not null order by posnegscore asc limit 3  " )
    for row in cursor:
        f.write(row[0])
        f.write("\t")
        f.write(str(row[1]))
        f.write("\t")
        f.write("neg")
        f.write("\n")
    f.close()
Example #14
#!/usr/bin/env python
# coding: utf-8
__author__ = 'dick'

import searchengine

craw = searchengine.crawler('searchindex.db')
# craw.createindextables()
pages = [
    # 'http://www.bbc.com/',
    'https://www.hao123.com/?1477704964',
    # 'https://www.baidu.com',
]

# craw.crawl(pages)

e = searchengine.searcher('searchindex.db')
print e.getmatchrows('hao weather yes')
Example #15
'''
Created on Feb 16, 2014

@author: ssashita
A query is given as:
 python runquery.py 1 functional programming
 (1 is the userid, and the rest are the query words)
'''

from searchengine import searcher
import sys
import cconfigurator

if __name__ == '__main__':
    config = cconfigurator.configure('crawled.db')
    listargs=[]
    if len(sys.argv) > 2:
        for arg in sys.argv[2:]:
            listargs.append(arg)
        s=searcher('crawled.db')
        s.query(' '.join([str(x) for x in listargs]),sys.argv[1])
    else:
        print("At least 3 args required. Second one is the userid and rest are the query words")
Example #16
def query():
    e = searchengine.searcher('searchIndex.db')
    print e.query('functional programming...')
Example #17
def testquery(q = 'functional programming'):
   search = searchengine.searcher()
   search.query(q)
Example #18
    print '\n'

if __name__ == '__main__':
    '''
2. Boolean operations. Many search engines support Boolean queries, which allow
users to construct searches like "python OR perl." An OR search can work by
doing the queries separately and combining the results, but what about "python
AND (program OR code)"? Modify the query methods to support some basic
Boolean operations.
3. Exact matches. Search engines often support "exact match" queries, where the
words in the page must match the words in the query in the same order with no
additional words in between. Create a new version of getrows that only returns
results that are exact matches. (Hint: you can use subtraction in SQL to get the
difference between the word locations.) 
    '''
    dbname = 'searchindex.db'
    if True:
        crawler = se.crawler(dbname)
        crawler.createindextables()
        pages = [
            'https://www.zhihu.com/',
            'https://github.com/'
        ]
        crawler.crawl(pages, depth=2)
        crawler.calculatepagerank()
    else:
        searcher = se.searcher(dbname)
        q = 'zhihu career'
        print searcher.query(q)
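
The OR half of exercise 2 above can be prototyped without modifying the query methods: run each sub-query separately and merge the returned url ids. A minimal sketch, assuming query(q) returns (wordids, urlids) as in the examples above; or_query is a hypothetical helper, not part of the book's searchengine module.

import searchengine

def or_query(dbname, subqueries):
    # hypothetical helper: evaluate each sub-query on its own and
    # union the returned url ids, preserving first-seen order
    e = searchengine.searcher(dbname)
    merged, seen = [], set()
    for q in subqueries:
        wordids, urlids = e.query(q)
        for urlid in urlids:
            if urlid not in seen:
                seen.add(urlid)
                merged.append(urlid)
    return merged

# 'python OR perl' becomes two sub-queries whose results are merged
print or_query('searchindex.db', ['python', 'perl'])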

Example #19
def test_se_search():
    searcher = se.searcher('crawler.db')
    result = searcher.query('python language blog')
    print result
Example #20
def test_query_ranking(weightFunc):
    sys.stderr.write("testing query with weighting function '%s'...\n" % weightFunc)
    e=searchengine.searcher('searchindex.db')
    print e.query('programming',weightFunc)
Example #21
from flask import Flask, render_template, request, redirect
import searchengine, neuralnet, crawler
searcher = searchengine.searcher('searchengine.db')
crawler = crawler.crawler('searchengine.db')
nnet = neuralnet.searchnet('nn.db')


app = Flask(__name__)


@app.route("/")
def search():
	if request.args:
		queryText = request.args.get('q')
		(wordids, scores, urlIdsList, urlsList) = searcher.query(queryText)
		if len(urlIdsList) != 0:
			listOfItems = [{'id': urlIdsList[i], 'url': urlsList[i], 'score': scores[i]} for i in range(len(urlIdsList))]
		else:
			listOfItems = []
		return render_template('index.html', list=listOfItems, q=queryText)
	return render_template('index.html', list=None)


@app.route('/train', methods=['POST', 'GET'])
def train():		
	if request.method == 'POST':
		queryPhrase = request.json['q']
		selectedURLId = int(request.json['clicked'])
		app.logger.debug('queryPhrase: %s => selectedURLId: %s' %(queryPhrase, selectedURLId))
		(wordids, scores, urlIdsList, urlsList) = searcher.query(queryPhrase)
		nnet.trainquery(wordids, urlIdsList, selectedURLId)
Example #22
def test_full_match_words():
    s = searcher("output/search.db")
    print s.getfullmatchrows("simple web page")
Example #23
def wordFrequency():
  reload(searchengine)
  e=searchengine.searcher('searchindex.db')
  e.query('sqlite3 python')
Example #24
from django.conf.urls.static import static
from django.shortcuts import render
from django.http import HttpResponse, HttpRequest
from django.shortcuts import render_to_response
from django.template import RequestContext
import searchengine
import nn

e = searchengine.searcher('wikipedia.db')
allurls = e.getallurls("functional")

# Create your views here.


def home_view(request):
    return HttpResponse(request.method)


def search_string(request):
    query = request.GET['searchquery']
    data = e.query(query)
    context_dict = {'results': data, 'query': query}
    return render_to_response('results_page.html', context_dict)


def train_nn(request, page_alias, selected_result):
    network = nn.searchnet('nn.db')
    words = e.getwordids(page_alias)
    if (selected_result.endswith("/")):
        selected_result = selected_result[:-1]
    urlid = e.geturlid(selected_result)
Example #25
    def setUp(self):
        self.s = searchengine.searcher("test.db")
Example #26
def query():
	e=searchengine.searcher('searchIndex.db')
	print e.query('functional programming...')
Example #27
import searchengine
pages = ['https://news.google.com.tw/']
crawler = searchengine.crawler('test')
crawler.createindextables()  #create tables

crawler.crawl(pages)

crawler.calculatepagerank()

e = searchengine.searcher('test')
e.query('單場 球季')
Example #28
def contentranking():
  reload(searchengine)
  #mynet=nn.searchnet('nn.db')
  #mynet.maketables()
  e=searchengine.searcher('searchindex.db')
  e.query('sqlite3 python')
Example #29
# -*- coding: utf-8 -*-
from tornado.ioloop import IOLoop
from tornado.web import RequestHandler, Application, url, StaticFileHandler
import os.path
import sys

sys.path.insert(0, os.path.abspath("../collective-intelligence"))

import searchengine as se
searcher = se.searcher("index.db")

foofle_data = {"query" : "",
               "results" : []}

def update_data(query):
    foofle_data["query"] = query
    foofle_data["results"] = searcher.query(query)

class MainHandler(RequestHandler):
    def initialize(self, data):
        self.data = data

    def get(self):
        self.render("index.html", query = self.data["query"], results = self.data["results"])

    def post(self):
        query = self.get_argument("input-query")
        print "La busqueda que se realizara utilizara la cadena '%s' como consulta" % query
        update_data(query)
        self.get()
Example #30
def documentLocation():
  reload(searchengine)
  e=searchengine.searcher('searchindex.db')
  e.query('sqlite3 python')
Example #31
def test_getmatchrows():
    sys.stderr.write("testing get match rows...\n")
    e=searchengine.searcher('searchindex.db')
    print e.getmatchrows('programming')
Example #32
## coding:utf-8 ##
import searchengine

e=searchengine.searcher('searchindex.db')
#print e.getmatchrows('perl python functional')
while 1:
	print "输入查询的单词(en)"
	q=raw_input()
	print e.query(q)
Example #33
def test_query():
    sys.stderr.write("testing query...\n")
    e=searchengine.searcher('searchindex.db')
    print e.query('programming')
Example #34
for test in eval_tests:
    node = ast.parse(test)
    print ast.dump(node)
    # MyVisitor().visit(node)
    print '\n'

if __name__ == '__main__':
    '''
2. Boolean operations. Many search engines support Boolean queries, which allow
users to construct searches like "python OR perl." An OR search can work by
doing the queries separately and combining the results, but what about "python
AND (program OR code)"? Modify the query methods to support some basic
Boolean operations.
3. Exact matches. Search engines often support "exact match" queries, where the
words in the page must match the words in the query in the same order with no
additional words in between. Create a new version of getrows that only returns
results that are exact matches. (Hint: you can use subtraction in SQL to get the
difference between the word locations.) 
    '''
    dbname = 'searchindex.db'
    if True:
        crawler = se.crawler(dbname)
        crawler.createindextables()
        pages = ['https://www.zhihu.com/', 'https://github.com/']
        crawler.crawl(pages, depth=2)
        crawler.calculatepagerank()
    else:
        searcher = se.searcher(dbname)
        q = 'zhihu career'
        print searcher.query(q)
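
Exercise 3's exact-match variant can be sketched directly against the book's schema: join one copy of the wordlocation table per query word and use SQL subtraction on the location columns to require consecutive positions, per the hint. A minimal sketch, assuming the book's tables wordlist(word) (looked up by rowid) and wordlocation(urlid, wordid, location); getexactmatchrows is a hypothetical name.

def getexactmatchrows(con, q):
    # one wordlocation alias per query word, chained by urlid and
    # a location difference of exactly 1 between neighbouring words
    words = q.lower().split()
    tables, clauses = [], []
    for i, word in enumerate(words):
        row = con.execute(
            "select rowid from wordlist where word=?", (word,)).fetchone()
        if row is None:
            return []
        tables.append('wordlocation w%d' % i)
        clauses.append('w%d.wordid=%d' % (i, row[0]))
        if i > 0:
            clauses.append('w%d.urlid=w%d.urlid' % (i - 1, i))
            clauses.append('w%d.location-w%d.location=1' % (i, i - 1))
    sql = 'select w0.urlid, w0.location from %s where %s' % (
        ','.join(tables), ' and '.join(clauses))
    return con.execute(sql).fetchall()

Calling getexactmatchrows(crawler.con, 'simple web page') then returns only pages where the three words appear contiguously and in order.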
Example #35
#!/usr/bin/python
# coding: UTF-8
# Author: David
# Email: [email protected]
# Created: 2016-08-01 14:08
# Last modified: 2016-08-01 15:54
# Filename: search_test.py
# Description:
import searchengine
e = searchengine.searcher()
e.query('form authentication')