def search():
    """Render the search page; on POST, run the query and show its results."""
    if request.method == 'POST':
        term = request.form['searchTermInput']
        matches = SearchEngine().search(term.lower())
        return render_template("search.html",
                               resultList=matches,
                               listSize=len(matches))
    # Plain GET: empty search form.
    return render_template("search.html")
def index():
    """Render the index page; on POST, crawl/index from the submitted URL."""
    if request.method == 'POST':
        seed = request.form['urlInput']
        pages, words = SearchEngine().indexFrom(seed, MAX_DEPTH)
        return render_template("index.html",
                               numPagesIndexed=pages,
                               numWordsIndexed=words)
    # Plain GET: empty indexing form.
    return render_template("index.html")
def main():
    """Start the search engine and run the GUI event loop until the window closes."""
    engine = SearchEngine()
    engine.startSearchEngine()
    window = sg.Window("My Search Engine", layout)
    while True:
        event, values = window.read()
        # A None event means the window was closed by the user.
        if event is None:
            engine.closeConnection()
            break
        if event == "search":
            engine.searchInterface(values["IN"])
    window.close()
def setUp(self):
    """Write the text fixtures, index three of them, and open the engine under test."""
    # filename -> file contents (embedded \n's are deliberate: some tests
    # depend on multi-line fixtures).
    fixtures = {
        'test0.txt': 'All we need is,\n all we need is,\n all we need is',
        'test1.txt': 'Blood, blood,\n blood',
        'test2.txt': 'All we need is, all we need is,\n all we need is',
        'test.txt': 'All we need is, all we need is, all we need is',
        'testtest.txt': 'Blood, blood, blood',
        'testtesttest.txt': 'All we need is, all we need is,\n all we need is',
        'testSentence.txt': 'What do we need? All we need is blood. Pain pain pain pain',
    }
    for name, text in fixtures.items():
        with open(name, 'w') as f:
            f.write(text)
    # Only the first three files are indexed into the test database.
    indexer = Indexator('TestDatabase')
    for name in ('test0.txt', 'test1.txt', 'test2.txt'):
        indexer.indexize(name)
    self.searchEngine = SearchEngine("TestDatabase")
def main():
    """Crawl the SMU test site, dump term-frequency stats, then run an
    interactive query loop until the user types "stop"."""
    print("Start crawling, please wait")
    crawler = WebCrawler("https://s2.smu.edu/~fmoore/index.htm", 200)
    crawler.setup()
    crawler.crawl()
    crawler.buildTFMatrix()
    crawler.printTFMatrix()
    crawler.topNWords(20)
    crawler.printInfo()
    # Fix: message previously read "results are save to ..." (grammar bug).
    print("Crawling completed, results are saved to result.txt and tf_matrix.csv")
    print("Starting query search")
    engine = SearchEngine(crawler)
    engine.loadThesaurus("thesaurus.csv")
    while True:
        query = input("Please input query or stop to terminate query search:")
        # Robustness: strip surrounding whitespace as well as lower-casing,
        # so inputs like "Stop " still terminate the loop.
        query = query.strip().lower()
        if query == "stop":
            print("Thanks for using!")
            break
        engine.engine(query)
        print("Done")
        print('+++++++++++++++++++++++++++++++++++++++++')
def genData(snippets, filename):
    """Yield one Elasticsearch bulk-index action per snippet.

    The "id" couples the source file number with the snippet's CSV row
    (rows start at 2 because row 1 is the header).
    """
    for row, snippet in enumerate(snippets):
        yield {
            "_index": "news_prog",
            "id": (filename, row + 2),
            "snippet": snippet,
        }


if __name__ == "__main__":
    here = os.path.dirname(os.path.realpath(__file__))
    data_dir = os.path.realpath(os.path.join(here, "data"))
    files = [os.path.join(data_dir, name) for name in sorted(os.listdir(data_dir))]
    # Our Search Engine
    engine = SearchEngine()
    # Elasticsearch
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    for file in files:
        snippets = getSnippets(file)
        # "123.csv" -> 123, used as the file part of each snippet id.
        filename = int(os.path.split(file)[1].split(".csv")[0])
        helpers.bulk(es, genData(snippets, filename))
    while True:
        print("\n\n0. Standard query")
        print("1. Allows positional indexing")
        print("2. Allows wildcard terms")
        print("3. Allows both wildcards and positional indexing")
from invertedIndex import InvertedIndex
import datetime
from searchEngine import SearchEngine
from pathlib import Path
import json

# Timing scaffolding and the one-off index build are kept for reference:
#x = datetime.datetime.now()
'''
invertIndex = InvertedIndex("DEV")
invertIndex.readIndex(r"DEV")
'''
#y = datetime.datetime.now()

# Launch the interactive command-line search over the DEV index.
engine = SearchEngine("DEV")
engine.searchInterfaceCommandLine()
# Earlier scraping experiments, kept for reference:
# from rss import Rss
# module = Rss('chosun')
# module.scrap()
#
# from crawler import Crawler
# module = Crawler('chosun')
# result = module.scrap('http://news.chosun.com/site/data/html_dir/2018/10/08/2018100802172.html')
# print(result)

from searchEngine import SearchEngine

# Run a sample query ("fire") against the chosun index.
module = SearchEngine('chosun')
module.do_search('화재')
import urllib, urllib2, requests import markupsafe import os, json, datetime, logging import model from webapp2_extras import sessions from google.appengine.api import channel from searchimagery import ScriptEngine from searchEngine import SearchEngine from geolocation import Geolocation from YoutubeSearch import YtubeSearch geolocation = Geolocation() searchEngine = SearchEngine() dbWrapper = model.DataStoreWrapper class QueryHandler(webapp2.RequestHandler): def dispatch(self): # Get a session store for this request. self.session_store = sessions.get_store(request=self.request) try: # Dispatch the request. webapp2.RequestHandler.dispatch(self) finally: # Save all sessions. self.session_store.save_sessions(self.response) def getSession(self):
from flask import Flask, render_template, request, redirect
from searchEngine import SearchEngine

app = Flask(__name__)
se = SearchEngine("index")


@app.route("/")
def redir():
    """Serve the landing/search page."""
    return render_template("index.html")


# POST request to get results.
@app.route("/results", methods=['POST', 'GET'])
def getResults():
    """Render the top-5 results for a submitted query.

    Fix: the original fell through with no return on GET (and crashed with
    KeyError on a missing/empty "query" field), which Flask turns into a
    500 error.  Now any request without a usable query is sent back to the
    search page instead.
    """
    if request.method == "POST":
        query = request.form.get("query", "").strip()
        if query:
            results = se.search(query, 5)
            return render_template("results.html", results=results)
    return render_template("index.html")


@app.route("/back")
def goBack():
    """Return to the search page."""
    return render_template("index.html")


# main function
if __name__ == "__main__":
    app.run()
# test search engine methods
from searchEngine import SearchEngine

se = SearchEngine("index")
# Smoke-test a few representative queries, top 5 results each.
for q in ("cristina lopes", "machine learning", "ACM", "master of software engineering"):
    print(se.search(q, 5))

# import pickle
#
# file = open("pIndex1.pkl", "rb")
# d = pickle.load(file)
# file.close()
#
# print(len(d))
# for k, v in d.items():
#     #print(k,v)
#     print(k, ": ", v)

# Design notes:
# LINKED LIST OR SET UF POSTINGS?
# IF YOU USE A SET, YOU NEED TO IMPLEMENT THE __EQ__ ETC AND HASH
# USE STEMMING TO CUT DOWN ON # OF ENTRIES IN INDICES
# MERGING STRATEGY
# have an index for every letter
# create a partial index
# go through that sorted index and load to memory each letter
class Test(unittest.TestCase):
    """Unit tests for SearchEngine: single-term search, multi-term queries,
    and context-window construction/merging/sentence expansion.

    Position arguments are (start, end, line) offsets into the fixture files
    written by setUp — presumably character offsets; verify against the
    Indexator implementation.
    """

    def setUp(self):
        # Seven small text fixtures; embedded '\n' (and the testN naming)
        # matter: some tests address tokens on line 2 or 3.
        with open('test0.txt', 'w') as f:
            f.write('All we need is,\n all we need is,\n all we need is')
        with open('test1.txt', 'w') as f:
            f.write('Blood, blood,\n blood')
        with open('test2.txt', 'w') as f:
            f.write('All we need is, all we need is,\n all we need is')
        with open('test.txt', 'w') as f:
            f.write('All we need is, all we need is, all we need is')
        with open('testtest.txt', 'w') as f:
            f.write('Blood, blood, blood')
        with open('testtesttest.txt', 'w') as f:
            f.write('All we need is, all we need is,\n all we need is')
        with open('testSentence.txt', 'w') as f:
            f.write(
                'What do we need? All we need is blood. Pain pain pain pain')
        # Only test0/1/2 are indexed into the test database; the other files
        # are read directly by the context-window tests.
        indexer = Indexator('TestDatabase')
        indexer.indexize('test0.txt')
        indexer.indexize('test1.txt')
        indexer.indexize('test2.txt')
        self.searchEngine = SearchEngine("TestDatabase")

    # unittests for search
    def test_input_type_number(self):
        # Non-string input must raise, not be coerced.
        with self.assertRaises(ValueError):
            self.searchEngine.search(13)

    def test_input_type_not_exists(self):
        # A term absent from the index yields an empty result dict.
        self.assertEqual(self.searchEngine.search('вискас'), {})

    def test_we(self):
        # Every occurrence of "we" in the indexed files, keyed by filename.
        expected = {
            'test0.txt': [
                indexator.Position(4, 6, 1),
                indexator.Position(5, 7, 2),
                indexator.Position(5, 7, 3)
            ],
            'test2.txt': [
                indexator.Position(4, 6, 1),
                indexator.Position(20, 22, 1),
                indexator.Position(5, 7, 2)
            ]
        }
        self.assertEqual(self.searchEngine.search('we'), expected)

    def test_blood(self):
        expected = {
            'test1.txt': [indexator.Position(7, 12, 1),
                          indexator.Position(1, 6, 2)]
        }
        self.assertEqual(self.searchEngine.search("blood"), expected)

    # unittests for searchQuery
    def test__query_input_type_number(self):
        with self.assertRaises(ValueError):
            self.searchEngine.searchQuery(13)

    def test_query_input_type_not_exists(self):
        self.assertEqual(self.searchEngine.searchQuery('вискас'), {})

    def test_we_is(self):
        # Multi-term query: positions of both "we" and "is", merged per file.
        expected = {
            'test0.txt': [
                indexator.Position(4, 6, 1),
                indexator.Position(5, 7, 2),
                indexator.Position(5, 7, 3),
                indexator.Position(12, 14, 1),
                indexator.Position(13, 15, 2),
                indexator.Position(13, 15, 3)
            ],
            'test2.txt': [
                indexator.Position(4, 6, 1),
                indexator.Position(20, 22, 1),
                indexator.Position(5, 7, 2),
                indexator.Position(12, 14, 1),
                indexator.Position(28, 30, 1),
                indexator.Position(13, 15, 2)
            ]
        }
        self.assertEqual(self.searchEngine.searchQuery('we is'), expected)

    def test_need(self):
        expected = {
            'test0.txt': [
                indexator.Position(7, 11, 1),
                indexator.Position(8, 12, 2),
                indexator.Position(8, 12, 3)
            ],
            'test2.txt': [
                indexator.Position(7, 11, 1),
                indexator.Position(23, 27, 1),
                indexator.Position(8, 12, 2)
            ]
        }
        self.assertEqual(self.searchEngine.searchQuery('need'), expected)

    # unittests for contexts
    def test_context(self):
        # A window of size 2 around the "we" at offset 20-22 of test.txt.
        pos = indexator.Position(20, 22, 1)
        context = searchEngine.ContextWindow.makeWindowGreatAgain(
            2, 'test.txt', pos)
        self.assertEqual(context.string, "is, all we need is")

    def test_context_line_not_exists(self):
        # Line 2 does not exist in the single-line test.txt.
        pos = indexator.Position(20, 22, 2)
        with self.assertRaises(ValueError):
            searchEngine.ContextWindow.makeWindowGreatAgain(2, 'test.txt', pos)

    def test_context_large_size(self):
        # A window larger than the line is clipped to the whole line.
        pos = indexator.Position(20, 22, 1)
        context = searchEngine.ContextWindow.makeWindowGreatAgain(
            8, 'test.txt', pos)
        self.assertEqual(context.string,
                         "All we need is, all we need is, all we need is")

    def test_context_zero_size(self):
        # Size 0 yields just the target token itself.
        pos = indexator.Position(20, 22, 1)
        context = searchEngine.ContextWindow.makeWindowGreatAgain(
            0, 'test.txt', pos)
        self.assertEqual(context.string, "we")

    def test_context_two_windows(self):
        # Two overlapping windows on the same line merge into one.
        poss = [indexator.Position(20, 22, 1), indexator.Position(32, 35, 1)]
        contexts = [
            searchEngine.ContextWindow.makeWindowGreatAgain(
                2, 'test.txt', poss[0]),
            searchEngine.ContextWindow.makeWindowGreatAgain(
                2, 'test.txt', poss[1])
        ]
        contextUnion = searchEngine.ContextWindow().unionWindows(contexts)
        targetTokensPositions = [
            indexator.Position(20, 22, 1),
            indexator.Position(32, 35, 1)
        ]
        expected = searchEngine.ContextWindow.initWithData(
            "All we need is, all we need is, all we need is",
            targetTokensPositions, 43, 12, "is, all we need is, all we need",
            "test.txt", 1)
        expectedList = []
        expectedList.append(expected)
        self.assertEqual(contextUnion, expectedList)

    def test_context_many_windows(self):
        # Windows from three different files: overlapping ones merge,
        # windows in distinct files/lines stay separate.
        poss = [
            indexator.Position(20, 22, 1),
            indexator.Position(32, 35, 1),
            indexator.Position(7, 12, 1),
            indexator.Position(20, 22, 1),
            indexator.Position(28, 30, 1),
            indexator.Position(1, 4, 2)
        ]
        contexts = [
            searchEngine.ContextWindow.makeWindowGreatAgain(
                2, 'test.txt', poss[0]),
            searchEngine.ContextWindow.makeWindowGreatAgain(
                2, 'test.txt', poss[1]),
            searchEngine.ContextWindow.makeWindowGreatAgain(
                1, 'testtest.txt', poss[2]),
            searchEngine.ContextWindow.makeWindowGreatAgain(
                8, 'testtesttest.txt', poss[3]),
            searchEngine.ContextWindow.makeWindowGreatAgain(
                2, 'testtesttest.txt', poss[4]),
            searchEngine.ContextWindow.makeWindowGreatAgain(
                2, 'testtesttest.txt', poss[5])
        ]
        contextUnion = searchEngine.ContextWindow().unionWindows(contexts)
        targetTokensPositions1 = [
            indexator.Position(20, 22, 1),
            indexator.Position(32, 35, 1)
        ]
        expected1 = searchEngine.ContextWindow.initWithData(
            "All we need is, all we need is, all we need is",
            targetTokensPositions1, 43, 12,
            "is, all we need is, all we need", "test.txt", 1)
        targetTokensPositions2 = [indexator.Position(7, 12, 1)]
        expected2 = searchEngine.ContextWindow.initWithData(
            "Blood, blood, blood", targetTokensPositions2, 19, 0,
            "Blood, blood, blood", "testtest.txt", 1)
        targetTokensPositions3 = [
            indexator.Position(20, 22, 1),
            indexator.Position(28, 30, 1)
        ]
        expected3 = searchEngine.ContextWindow.initWithData(
            "All we need is, all we need is,\n", targetTokensPositions3, 30,
            0, "All we need is, all we need is", "testtesttest.txt", 1)
        targetTokensPositions4 = [indexator.Position(1, 4, 2)]
        expected4 = searchEngine.ContextWindow.initWithData(
            " all we need is", targetTokensPositions4, 12, 1, "all we need",
            "testtesttest.txt", 2)
        expectedList = []
        expectedList.append(expected1)
        expectedList.append(expected2)
        expectedList.append(expected3)
        expectedList.append(expected4)
        self.assertEqual(contextUnion, expectedList)

    def test_context_expand_to_sentence(self):
        # A size-1 window expands to the full enclosing sentence.
        pos = indexator.Position(24, 28, 1)
        context = searchEngine.ContextWindow.makeWindowGreatAgain(
            1, 'testSentence.txt', pos)
        context.expandToSentence()
        targetTokensPositions = [indexator.Position(24, 28, 1)]
        expected = searchEngine.ContextWindow.initWithData(
            "What do we need? All we need is blood. Pain pain pain pain",
            targetTokensPositions, 38, 17, "All we need is blood.",
            "testSentence.txt", 1)
        self.assertEqual(context, expected)

    def test_context_expand_to_sentence_two_tokens(self):
        # Two tokens in the same sentence: merge first, then expand.
        poss = [indexator.Position(21, 23, 1), indexator.Position(24, 28, 1)]
        contexts = [
            searchEngine.ContextWindow.makeWindowGreatAgain(
                1, 'testSentence.txt', poss[0]),
            searchEngine.ContextWindow.makeWindowGreatAgain(
                1, 'testSentence.txt', poss[1])
        ]
        contextUnion = searchEngine.ContextWindow().unionWindows(contexts)
        contextUnion[0].expandToSentence()
        context = contextUnion[0]
        targetTokensPositions = [
            indexator.Position(21, 23, 1),
            indexator.Position(24, 28, 1)
        ]
        expected = searchEngine.ContextWindow.initWithData(
            "What do we need? All we need is blood. Pain pain pain pain",
            targetTokensPositions, 38, 17, "All we need is blood.",
            "testSentence.txt", 1)
        self.assertEqual(context, expected)

    # def test_query_context(self):
    #     expected = {
    #         'test.txt': [
    #             indexator.Position(4, 6, 1),
    #             indexator.Position(5, 7, 2),
    #             indexator.Position(5, 7, 3),
    #             indexator.Position(12, 14, 1),
    #             indexator.Position(13, 15, 2),
    #             indexator.Position(13, 15, 3)],
    #         'test2.txt': [
    #             indexator.Position(4, 6, 1),
    #             indexator.Position(20, 22, 1),
    #             indexator.Position(5, 7, 2),
    #             indexator.Position(12, 14, 1),
    #             indexator.Position(28, 30, 1),
    #             indexator.Position(13, 15, 2)]}
    #     print(searchEngine.ContextWindow.makeWindowGreatAgain(
    #         3, 'test0.txt', indexator.Position(12, 14, 1),))
    #     self.assertEqual(self.searchEngine.searchQueryWindow('blood pain', 3), expected)

    def tearDown(self):
        # NOTE(review): explicit __del__ call — presumably closes the engine's
        # database handle so the files below can be removed; verify.
        self.searchEngine.__del__()
        # Remove the database files and every fixture (all start with 'test').
        files = os.listdir(path=".")
        for file in files:
            if file.startswith('TestDatabase'):
                os.remove(file)
            if file.startswith('test'):
                os.remove(file)