def __init__(self, seeds=[]):
    """ init with seeds
    Create/Open a file for storing progress
    """
    # self.settings = settings
    url = next(iter(SEEDS))
    parsed_uri = urlparse(url)
    self.url_base = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
    self.searchengine = SearchEngine()
    self.searchengine.db_connect()
    self.crawl_book = CrawlWorkbook(path=WWW_DIR, url=seeds[0].url)
    self.crawl_book.wb_open()
    # /! Will have to go in a Frontera Middleware at some point
    # retrieve weighted_links, weighted_links_done...
    self.weighted_links = self.crawl_book.weighted_links
    self.weighted_links_done = self.crawl_book.weighted_links_done
    self.ignore_seeds = self.crawl_book.ignore_seeds
    self.ignored_pages = self.crawl_book.ignored_pages
    self.add_seeds(seeds)
    # build requests from weighted_links
    for wl in self.weighted_links:
        self.requests.append(requests.Request(url=wl.url))
    for wl in self.weighted_links_done:
        self.requests_done.append(requests.Request(url=wl.url))
    # ignore list
    ignore_suffixes = [
        '/es/', '/fr/', '/ca/', '/newsletters', '/2021/',
        '/2020/01/', '/2020/02/', '/2020/03/', '/2020/04/', '/2020/05/',
        '/2020/06/', '/2020/07/', '/2020/08/', '/2020/09/', '/2020/10/',
        '/2019/', '/2018/', '/2017/', '/2016/', '/2015/', '/2014/',
        '/section/world', '/video/world', '/section/food', '/section/arts',
        '/section/sports', '/section/science', '/section/books',
        '/section/travel', '/section/realestate', '/section/fashion',
        '/section/technology', '/section/politics', '/section/business',
        '/section/style', '/section/well', '/section/style/love',
        '/section/us', '/section/video', '/section/interactive',
        '/section/magazine', '/international', '/section/t-magazine',
        '/section/live', '/live', '/video', '/interactive', '/issue/fashion',
        '/subscription', '/subscriptions', '/section/business/dealbook',
        '/pages/business/dealbook', '/privacy'
    ]
    if not self.ignore_seeds:
        self.ignore_seeds = [
            WeightedLink(url=urljoin(self.url_base, suffix))
            for suffix in ignore_suffixes
        ]
        self.crawl_book.ws_writerows(
            WORKBOOK['crawler']['worksheet']['ignoreseeds']['TITLE'],
            self.ignore_seeds)
def index():
    if request.method == 'POST':
        itemName = request.form['nm']
        se = SearchEngine()
        bigw = se.searchBigW(itemName)
        dj = se.searchDavidJones(itemName)
        return render_template('search.html', bigwItems=bigw, djItems=dj)
    else:
        return render_template('search.html', bigwItems=[], djItems=[])
def test_already_extended_window(self):
    self.indexator = Indexer('database')
    test_file_one = open('test_already_extended_window.txt', 'w')
    test_file_one.write('Alina Zakharova is a student!!')
    test_file_one.close()
    self.indexator.get_index_with_line('test_already_extended_window.txt')
    del self.indexator
    self.search = SearchEngine('database')
    window = windows.Context_Window.get_window(
        'test_already_extended_window.txt', Position_Plus(0, 16, 18), 2)
    os.remove('test_already_extended_window.txt')
class SearchController:
    def __init__(self):
        self.searchEngine = SearchEngine()

    def search(self, params):
        print(params)
        search_input = params.get('search_input')
        stop_words = params.get('stop_words')
        lemmatize = bool(params.get('lemmatize'))
        if stop_words is not None:
            search_input = self.searchEngine.deleteStopWords(
                search_input, int(stop_words))
        if lemmatize:
            result = self.searchEngine.search_lemmatized(search_input, 20)
        else:
            result = self.searchEngine.search_unlemmatized(search_input, 20)
        return result
def test_myError_str_not_found(self):
    self.indexator = Indexer('database')
    test_file_one = open('test_window_five.txt', 'w')
    test_file_one.write('Alina Zakharova is a student')
    test_file_one.close()
    self.indexator.get_index_with_line('test_window_five.txt')
    del self.indexator
    self.search = SearchEngine('database')
    with self.assertRaises(TypeError):
        result = windows.Context_Window.get_window(
            'test_window_five.txt', Position_Plus(3, 21, 28), 3)
    os.remove('test_window_five.txt')
class Input():
    s = SearchEngine()

    def __init__(self):
        with open('inputLinks.txt', 'r') as file:
            links = file.read().replace('\n', '').replace('\r', '')
            links = links.split(',')
        self.s.init(links)

    def search(self, query):
        file1 = open("output.txt", "w+")
        file1.write(str(self.s.searchString(query)))
        file1.close()
        return self.s.searchString(query)
def test_highlight_window_one(self):
    self.indexator = Indexer('database')
    test_file_one = open('test_highlight_window.txt', 'w')
    test_file_one.write('Alina Zakharova is a student')
    test_file_one.close()
    self.indexator.get_index_with_line('test_highlight_window.txt')
    del self.indexator
    self.search = SearchEngine('database')
    window = windows.Context_Window.get_window('test_highlight_window.txt',
                                               Position_Plus(0, 6, 15), 1)
    result = window.highlight_window()
    output_string = 'Alina <b>Zakharova</b> is'
    self.assertEqual(result, output_string)
    os.remove('test_highlight_window.txt')
def test_extend_window_rus_one(self):
    self.indexator = Indexer('database')
    test_file_one = open('test_extend_window_rus.txt', 'w')
    test_file_one.write('Пьер с грустью слышал над собою насмешки.')
    test_file_one.close()
    self.indexator.get_index_with_line('test_extend_window_rus.txt')
    del self.indexator
    self.search = SearchEngine('database')
    window = windows.Context_Window.get_window(
        'test_extend_window_rus.txt', Position_Plus(0, 0, 4), 1)
    window.extend_window()
    extended_window = Context_Window(
        'Пьер с грустью слышал над собою насмешки.',
        [Position_Plus(0, 0, 4)], 0, 41)
    self.assertEqual(window, extended_window)
def test_extend_window(self):
    self.indexator = Indexer('database')
    test_file_one = open('test_extend_window.txt', 'w')
    test_file_one.write('Alina Zakharova is a student!!')
    test_file_one.close()
    self.indexator.get_index_with_line('test_extend_window.txt')
    del self.indexator
    self.search = SearchEngine('database')
    window = windows.Context_Window.get_window('test_extend_window.txt',
                                               Position_Plus(0, 6, 15), 1)
    window.extend_window()
    extended_window = Context_Window('Alina Zakharova is a student!!',
                                     [Position_Plus(0, 6, 15)], 0, 30)
    self.assertEqual(window, extended_window)
    os.remove('test_extend_window.txt')
def test_not_crossed(self):
    self.indexator = Indexer('database')
    test_file_one = open('test_not_crossed_window.txt', 'w')
    test_file_one.write('The girl named Alina Zakharova is a student')
    test_file_one.close()
    self.indexator.get_index_with_line('test_not_crossed_window.txt')
    del self.indexator
    self.search = SearchEngine('database')
    window_A = windows.Context_Window.get_window(
        'test_not_crossed_window.txt', Position_Plus(0, 31, 33), 1)
    window_B = windows.Context_Window.get_window(
        'test_not_crossed_window.txt', Position_Plus(0, 8, 14), 1)
    crossed_AB = window_A.is_crossed(window_B)
    self.assertEqual(False, crossed_AB)
    os.remove('test_not_crossed_window.txt')
def page_save_to_file(self, request, soup):
    """ save page to a #.html file where # is the url crc32
    TODO: save request
    """
    # eg. /var/www/html/apple.com/256.html
    url_hash = SearchEngine.hash_url(request.url)
    file_name = os.path.join(HTML_DIR, urlsplit(request.url).netloc,
                             str(url_hash) + '.html')
    print(file_name)
    os.makedirs(os.path.join(HTML_DIR, urlsplit(request.url).netloc),
                exist_ok=True)
    f = open(file_name, 'w')
    f.write(soup.prettify())
    f.close()
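SearchEngine.hash_url is not shown in these snippets. A minimal sketch of what it might look like, assuming (per the docstring) that it is simply a CRC32 of the URL used to derive a stable file name; the name and behavior here are assumptions, not the original implementation:

import zlib

def hash_url(url):
    # Hypothetical helper: CRC32 of the UTF-8 encoded URL as an unsigned int,
    # so the saved page gets a short, deterministic name like '256.html'.
    return zlib.crc32(url.encode('utf-8')) & 0xffffffff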
def test_extend_window_rus_two(self):
    self.indexator = Indexer('database')
    test_file_one = open('test_extend_window_rus.txt', 'w')
    test_file_one.write(
        'С разных сторон виднелись пожары. Пьер тогда еще не понимал значения сожженной Москвы и с ужасом смотрел на эти пожары.'
    )
    test_file_one.close()
    self.indexator.get_index_with_line('test_extend_window_rus.txt')
    del self.indexator
    self.search = SearchEngine('database')
    window = windows.Context_Window.get_window(
        'test_extend_window_rus.txt', Position_Plus(0, 34, 38), 1)
    window.extend_window()
    extended_window = Context_Window(
        'С разных сторон виднелись пожары. Пьер тогда еще не понимал значения сожженной Москвы и с ужасом смотрел на эти пожары.',
        [Position_Plus(0, 34, 38)], 0, 119)
    self.assertEqual(window, extended_window)
def test_extend_window_rus(self):
    self.indexator = Indexer('database')
    test_file_one = open('test_extend_window_rus.txt', 'w')
    test_file_one.write(
        'Прогать очень сложно! Алина Захарова студент лингвист!! Аня любит немецкий. В Петербурге идет дождь.'
    )
    test_file_one.close()
    self.indexator.get_index_with_line('test_extend_window_rus.txt')
    del self.indexator
    self.search = SearchEngine('database')
    window = windows.Context_Window.get_window(
        'test_extend_window_rus.txt', Position_Plus(0, 28, 36), 1)
    window.extend_window()
    extended_window = Context_Window(
        'Прогать очень сложно! Алина Захарова студент лингвист!! Аня любит немецкий. В Петербурге идет дождь.',
        [Position_Plus(0, 28, 36)], 22, 55)
    self.assertEqual(window, extended_window)
def test_get_window_begin(self):
    self.indexator = Indexer('database')
    test_file_one = open('test_window_three.txt', 'w')
    test_file_one.write('Alina Zakharova is a student')
    test_file_one.close()
    self.indexator.get_index_with_line('test_window_three.txt')
    del self.indexator
    self.search = SearchEngine('database')
    result = windows.Context_Window.get_window('test_window_three.txt',
                                               Position_Plus(0, 0, 5), 1)
    self.win = Context_Window('string', 'positions', 'win_start', 'win_end')
    self.win.string = 'Alina Zakharova is a student'
    self.win.positions = [Position_Plus(0, 0, 5)]
    self.win.win_start = 0
    self.win.win_end = 15
    self.assertEqual(result.string, self.win.string)
    self.assertEqual(result.positions, self.win.positions)
    self.assertEqual(result.win_start, self.win.win_start)
    self.assertEqual(result.win_end, self.win.win_end)
    self.assertEqual(result, self.win)
    os.remove('test_window_three.txt')
def test_get_window_simple_plus(self):
    self.indexator = Indexer('database')
    test_file_one = open('test_window_two.txt', 'w')
    test_file_one.write('Little Alina Zakharova is a linguist student)))')
    test_file_one.close()
    self.indexator.get_index_with_line('test_window_two.txt')
    del self.indexator
    self.search = SearchEngine('database')
    result = windows.Context_Window.get_window('test_window_two.txt',
                                               Position_Plus(0, 23, 25), 2)
    self.win = Context_Window('string', 'positions', 'win_start', 'win_end')
    self.win.string = 'Little Alina Zakharova is a linguist student)))'
    self.win.positions = [Position_Plus(0, 23, 25)]
    self.win.win_start = 7
    self.win.win_end = 36
    self.assertEqual(result.string, self.win.string)
    self.assertEqual(result.positions, self.win.positions)
    self.assertEqual(result.win_start, self.win.win_start)
    self.assertEqual(result.win_end, self.win.win_end)
    self.assertEqual(result, self.win)
    os.remove('test_window_two.txt')
def test_united_window(self):
    self.indexator = Indexer('database')
    test_file_one = open('test_united_window.txt', 'w')
    test_file_one.write('The girl named Alina Zakharova is a student')
    test_file_one.close()
    self.indexator.get_index_with_line('test_united_window.txt')
    del self.indexator
    self.search = SearchEngine('database')
    window_A = windows.Context_Window.get_window('test_united_window.txt',
                                                 Position_Plus(0, 4, 20), 1)
    window_B = windows.Context_Window.get_window('test_united_window.txt',
                                                 Position_Plus(0, 9, 30), 1)
    window_A.get_united_window(window_B)
    self.win = windows.Context_Window(
        'The girl named Alina Zakharova is a student',
        [Position_Plus(0, 4, 20), Position_Plus(0, 9, 30)], 9, 20)
    self.assertEqual(window_A.string, self.win.string)
    self.assertEqual(window_A.win_start, self.win.win_start)
    self.assertEqual(window_A.win_end, self.win.win_end)
    os.remove('test_united_window.txt')
class BookInventory(object):
    _BOOK_META_ID_INDEX = 0
    _BOOK_META_TITLE_INDEX = 1  # question
    _BOOK_META_AUTHOR_INDEX = 2  # answer
    _NO_RESULTS_MESSAGE = 'Sorry, no results.'

    def __init__(self, filename):
        self.filename = filename
        self.engine = SearchEngine()

    @timed
    def load_books(self):
        processor = BookDataPreprocessor()
        with open(self.filename) as catalog:
            for entry in catalog:
                book_desc = processor.preprocess(entry)
                metadata = ' '.join(book_desc[self._BOOK_META_TITLE_INDEX:])
                iid = book_desc[self._BOOK_META_ID_INDEX].strip()
                title = book_desc[self._BOOK_META_TITLE_INDEX].strip()
                author = book_desc[self._BOOK_META_AUTHOR_INDEX].strip()
                book = Book(iid, title, author, metadata)
                self.engine.add_object(book)
        self.engine.start()

    @timed
    def search_books(self, query, n_results=10):
        result = ''
        if len(query) > 0:
            result = self.engine.search(query, n_results)
        if len(result) > 0:
            return '\n'.join([str(indexable) for indexable in result])
        return self._NO_RESULTS_MESSAGE

    def books_count(self):
        return self.engine.count()
def anytime_gbfs(initial_state, heur_fn, timebound=10):
    # IMPLEMENT
    """Provides an implementation of anytime greedy best-first search,
    as described in the HW1 handout.
    INPUT: a rush hour state that represents the start state and a timebound (number of seconds)
    OUTPUT: A goal state (if a goal is found), else False
    Implementation of the anytime greedy BFS algorithm."""
    search_engine = SearchEngine("best_first", "full")
    search_engine.init_search(initial_state, rushhour_goal_fn, heur_fn)
    gval_cost_bound = float("inf")
    time_left = timebound
    init_time = os.times()[0]
    solution = search_engine.search(
        timebound=time_left,
        costbound=(gval_cost_bound, float("inf"), float("inf"))
    )
    finish_time = os.times()[0]
    time_left -= finish_time - init_time
    if solution:
        gval_cost_bound = solution.gval
    else:
        return False
    while time_left > 0:
        init_time = os.times()[0]
        improved_solution = search_engine.search(
            timebound=time_left,
            costbound=(gval_cost_bound, float("inf"), float("inf"))
        )
        time_left -= os.times()[0] - init_time
        if improved_solution:
            gval_cost_bound = improved_solution.gval
            solution = improved_solution
        else:
            break
    return solution
def anytime_weighted_astar(initial_state, heur_fn, weight=1.0, timebound=10):
    # IMPLEMENT
    """Provides an implementation of anytime weighted a-star,
    as described in the HW1 handout.
    INPUT: a rush hour state that represents the start state and a timebound (number of seconds)
    OUTPUT: A goal state (if a goal is found), else False
    Implementation of the anytime weighted A* algorithm."""
    time_left = timebound
    wrapped_fval_function = lambda sN: fval_function(sN, weight)
    se = SearchEngine("custom", "full")
    se.init_search(initial_state, rushhour_goal_fn, heur_fn, wrapped_fval_function)
    cost_bound = float("inf")
    init_time = os.times()[0]
    solution = se.search(
        timebound=time_left,
        costbound=(float("inf"), float("inf"), cost_bound)
    )
    finish_time = os.times()[0]
    time_left -= finish_time - init_time
    if solution:
        cost_bound = solution.gval + heur_fn(solution)
    else:
        return False
    while time_left > 0:
        init_time = os.times()[0]
        improved_solution = se.search(
            timebound=time_left,
            costbound=(float("inf"), float("inf"), cost_bound)
        )
        finish_time = os.times()[0]
        time_left -= finish_time - init_time
        if improved_solution:
            cost_bound = improved_solution.gval + heur_fn(improved_solution)
            solution = improved_solution
        else:
            break
    return solution
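The wrapped fval_function referenced above is not shown in this snippet. A minimal sketch of the usual weighted A* priority it is expected to compute, assuming the wrapped search node sN exposes gval and hval as in the course's search module:

def fval_function(sN, weight):
    # Weighted A* priority: f(n) = g(n) + w * h(n).
    # sN.gval is the cost of the path so far, sN.hval the heuristic estimate.
    return sN.gval + weight * sN.hval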
def do_POST(self):
    '''
    This function sends search results
    '''
    # TIME HAS STARTED
    start_time = time.time()
    form = FieldStorage(fp=self.rfile, headers=self.headers,
                        environ={'REQUEST_METHOD': 'POST'})
    query = str(form.getvalue('query'))
    limit = form.getvalue("limit")
    if not limit:
        limit = 3
    else:
        limit = int(limit)
    offset = form.getvalue("offset")
    if not offset or int(offset) < 0:
        offset = 0
    else:
        offset = int(offset)
    doc_act = form.getvalue("action")
    if doc_act == "back" and offset != 0:
        offset = offset - limit
    elif doc_act == "forward":
        offset = offset + limit
    elif doc_act == "to the beginning":
        offset = 0
    # field, button and query
    self.send_response(200)
    self.send_header("Content-type", "text/html; charset=utf-8")
    self.end_headers()
    self.wfile.write(bytes("""
        <html>
        <body>
        <form method="post">
        <input type="text" name="query" value="%s"/>
        <input type="submit" name="search" value="Search"/>
        <br>
        <br>
        <label for="limit">
            Docs per page
            <input type="number" name="limit" placeholder="limit" value="%d"/>
        </label>
        <input type="hidden" name="offset" placeholder="offset" value="%d"/>
        """ % (query, limit, offset), encoding="utf-8"))
    # my list of (doclim, docset) pairs
    docs_list = []
    for num in range(limit + 1):
        # print('I am reading offsets and limits for quotes')
        quote_act = form.getvalue("action%d" % num)
        doclim = form.getvalue('doc%dlim' % num)
        # print(doclim, 'doclim')
        docset = form.getvalue('doc%dset' % num)
        # print(docset, 'docset')
        if not doclim or doclim == "None":
            doclim = 3
        else:
            doclim = int(doclim)
        if not docset or docset == "None":
            docset = 0
        else:
            docset = int(docset)
        if docset < 0:
            docset = 0
        # print('I am reading conditions for quote buttons')
        # When paging to another page of documents, the per-quote offset and
        # limit must be reset to their defaults: if I paged through quotes on
        # page 1, the quote "back" button became active, but after moving to
        # page 2 the "back" button for quotes on the new page (which I have
        # NOT paged yet) should stay inactive.
        if doc_act == "forward":
            docset = 0
        if quote_act == "back" and docset != 0:
            docset = docset - doclim
            # print(docset, 'docset for back')
        elif quote_act == "forward":
            docset = docset + doclim
            # print(docset, 'docset for forward')
        elif quote_act == "to the beginning":
            docset = 0
        # print(docset, 'docset as it is')
        # print(doclim, 'doclim as it is')
        # Add one to the quote limit so quotes can be paged: if there is one
        # more quote past the limit, paging forward is possible; otherwise
        # the "forward" button stays off (and the calendars lie)).
        docs_list.append((doclim + 1, docset))
        num += 1
    print(docs_list, 'docs_list')
    my_search = SearchEngine('TolstoyDataBase')
    # print(query)
    # one more pair, used to look for the next document
    doc_limof = []
    for pair in docs_list:
        doc_limof.append(pair)
    doc_limof.append((3, 0))
    print(doc_limof, 'doc_limof')
    # here the quote limit is already + 1
    final = my_search.qulim_search_modified(query, 1, limit + 1, offset, doc_limof)
    # conditions for enabling the document buttons
    print(offset, 'offset')
    if offset == 0:
        self.wfile.write(bytes("""
            <input type="submit" name="action" value="to the beginning" disabled/>
            <input type="submit" name="action" value="back" disabled/>""",
            encoding="UTF-8"))
    else:
        self.wfile.write(bytes("""
            <input type="submit" name="action" value="to the beginning"/>
            <input type="submit" name="action" value="back"/>""",
            encoding="UTF-8"))
    print(len(final), 'len of final')
    if len(final.keys()) < limit + 1:
        self.wfile.write(bytes("""
            <input type="submit" name="action" value="forward" disabled/>""",
            encoding="UTF-8"))
    else:
        self.wfile.write(bytes("""
            <input type="submit" name="action" value="forward"/>""",
            encoding="UTF-8"))
    # the beginning of the ordered list
    self.wfile.write(bytes('<ol>', encoding="utf-8"))
    if not final:
        self.wfile.write(bytes('NOT FOUND, SORRY', encoding="utf-8"))
    # Slice so that only `limit` results are shown; the extra (limit+1)-th
    # result was fetched only to decide whether "forward" should be enabled.
    for number, filename in enumerate(sorted(final)[:limit]):
        # create limit and offset for each document so it has its personal ones
        quote_lim = doc_limof[number][0]
        quote_offset = doc_limof[number][1]
        self.wfile.write(bytes('<li><p>%s</p>' % filename, encoding="utf-8"))
        self.wfile.write(bytes("""
            <label for="doc%dlim">
                Quotes per doc
                <input type="number" name="doc%dlim" value="%d"/>
            </label>
            <input type="hidden" name="doc%dset" value="%d"/>
            """ % (number, number, quote_lim - 1, number, quote_offset),
            encoding="utf-8"))
        # conditions for enabling the quote buttons
        print(quote_offset, 'quote_offset')
        if quote_offset == 0:
            self.wfile.write(bytes("""
                <input type="submit" name="action%d" value="to the beginning" disabled/>
                <input type="submit" name="action%d" value="back" disabled/>""" % (number, number),
                encoding="UTF-8"))
        else:
            self.wfile.write(bytes("""
                <input type="submit" name="action%d" value="to the beginning"/>
                <input type="submit" name="action%d" value="back"/>""" % (number, number),
                encoding="UTF-8"))
        print(len(final[filename]), 'len(final[filename])')
        print(quote_lim, 'quote_lim')
        print(limit, 'limit')
        print(offset, 'offset')
        # quote_lim already includes the +1, so do not add it again
        if len(final[filename]) < quote_lim:
            self.wfile.write(bytes("""
                <input type="submit" name="action%d" value="forward" disabled/>""" % number,
                encoding="UTF-8"))
        elif len(final[filename]) >= quote_lim:
            self.wfile.write(bytes("""
                <input type="submit" name="action%d" value="forward"/>""" % number,
                encoding="UTF-8"))
        # the beginning of the unordered list
        self.wfile.write(bytes('<ul>', encoding="utf-8"))
        # print quotes up to the quote limit minus one
        for num, quote in enumerate(final[filename][:-1]):
            self.wfile.write(bytes('<li><p>%s</p></li>' % quote, encoding="utf-8"))
        self.wfile.write(bytes('</ul>', encoding="utf-8"))
    self.wfile.write(bytes("""</ol></form></body></html>""", encoding="utf-8"))
    print('time:', time.time() - start_time)
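The handler above relies on a "fetch one extra row" trick to decide whether the forward button should be enabled. A minimal standalone sketch of that pattern (names and data are illustrative, not taken from the handler):

def page(items, offset, limit):
    # Fetch one extra item past the requested page: if it exists, there is
    # a next page and the "forward" control should stay enabled.
    window = items[offset:offset + limit + 1]
    has_next = len(window) > limit
    return window[:limit], has_next

# Example: 7 items, pages of 3.
items = list(range(7))
print(page(items, 0, 3))  # ([0, 1, 2], True)
print(page(items, 6, 3))  # ([6], False)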
def setUp(self):
    """
    Setup search engine that will be subjected to the tests.
    """
    self.engine = SearchEngine()
class BookInventory(object):
    """Class representing an inventory of books.

    Args:
        filename (str): File name containing book inventory data.

    Attributes:
        filename (str): File name containing book inventory data.
        engine (SearchEngine): Object responsible for indexing book inventory data.
    """
    _BOOK_META_ID_INDEX = 0
    _BOOK_META_TITLE_INDEX = 1
    _BOOK_META_AUTHOR_INDEX = 2
    _NO_RESULTS_MESSAGE = 'Sorry, no results.'

    def __init__(self, filename):
        self.filename = filename
        self.engine = SearchEngine()

    @timed
    def load_books(self):
        """Load books from a file name.

        This method leverages the iterable behavior of File objects,
        which automatically use buffered IO and memory management,
        handling large files effectively.
        """
        logger.info('Loading books from file...')
        processor = BookDataPreprocessor()
        with open(self.filename) as catalog:
            for entry in catalog:
                book_desc = processor.preprocess(entry)
                metadata = ' '.join(book_desc[self._BOOK_META_TITLE_INDEX:])
                iid = book_desc[self._BOOK_META_ID_INDEX].strip()
                title = book_desc[self._BOOK_META_TITLE_INDEX].strip()
                author = book_desc[self._BOOK_META_AUTHOR_INDEX].strip()
                book = Book(iid, title, author, metadata)
                self.engine.add_object(book)
        self.engine.start()

    @timed
    def search_books(self, query, n_results=10):
        """Search books according to the provided query of terms.

        The query is executed against the indexed books, and a list of books
        compatible with the provided terms is returned along with their
        tf-idf scores.

        Args:
            query (str): Query string with one or more terms.
            n_results (int): Desired number of results.

        Returns:
            list of IndexableResult: List containing books and their
            respective tf-idf scores.
        """
        result = ''
        if len(query) > 0:
            result = self.engine.search(query, n_results)
        if len(result) > 0:
            return '\n'.join([str(indexable) for indexable in result])
        return self._NO_RESULTS_MESSAGE

    def books_count(self):
        """Return number of books already in the index.

        Returns:
            int: Number of books indexed.
        """
        return self.engine.count()
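A minimal usage sketch of the inventory above, assuming a delimiter-separated catalog file that BookDataPreprocessor understands; the file name and query are illustrative assumptions:

# Hypothetical driver: index a catalog file, then run a ranked query against it.
inventory = BookInventory('books_catalog.txt')   # path is an assumption
inventory.load_books()
print('indexed books:', inventory.books_count())
print(inventory.search_books('machine learning', n_results=5))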
def __init__(self, filename):
    self.filename = filename
    self.engine = SearchEngine()
class SearchEngineTests(unittest.TestCase):
    """
    Test case for SearchEngine class.
    """

    def setUp(self):
        """
        Setup search engine that will be subjected to the tests.
        """
        self.engine = SearchEngine()

    def test_indexed_doc_count(self):
        """
        Test if the number of indexed objects is retrieved correctly.
        """
        sample1 = Indexable(1, "this is an indexable metadata")
        sample2 = Indexable(2, "this is an indexable super metadata")
        sample3 = Indexable(3, "this is another indexable metadata")
        self.build_sample_index([sample1, sample2, sample3])
        self.assertEqual(self.engine.count(), 3)

    def test_existent_term_search(self):
        """
        Test if search is correctly performed.
        """
        sample1 = Indexable(1, "this is an indexable metadata")
        sample2 = Indexable(2, "this is an indexable super metadata")
        sample3 = Indexable(3, "this is another indexable metadata")
        self.build_sample_index([sample1, sample2, sample3])
        expected_results = [
            IndexableResult(1.414214, sample1),
            IndexableResult(0.906589, sample2),
            IndexableResult(0.906589, sample3),
        ]
        results = self.engine.search("indexable metadata")
        self.assertListEqual(results, expected_results)

    def test_non_existent_term_search(self):
        """
        Test if search is correctly performed.
        """
        sample1 = Indexable(1, "this is an indexable metadata")
        sample2 = Indexable(2, "this is an indexable super metadata")
        sample3 = Indexable(3, "this is another indexable metadata")
        self.build_sample_index([sample1, sample2, sample3])
        expected_results = []
        results = self.engine.search("asdasdasdas")
        self.assertListEqual(results, expected_results)

    def test_search_result_limit(self):
        """
        Test if search results can be limited.
        """
        sample1 = Indexable(1, "this is an indexable metadata")
        sample2 = Indexable(2, "this is an indexable super metadata")
        sample3 = Indexable(3, "this is another indexable metadata")
        self.build_sample_index([sample1, sample2, sample3])
        expected_results = [
            IndexableResult(1.414214, sample1),
        ]
        results = self.engine.search("indexable metadata", 1)
        self.assertListEqual(results, expected_results)

    def build_sample_index(self, objects):
        for indexable in objects:
            self.engine.add_object(indexable)
        self.engine.start()
class wordInventory(object):
    """Class representing an inventory of words.

    Args:
        filename (str): File name containing word inventory data.

    Attributes:
        filename (str): File name containing word inventory data.
        engine (SearchEngine): Object responsible for indexing word inventory data.
    """
    _NO_RESULTS_MESSAGE = 'Sorry, no results.'

    def __init__(self, filename):
        self.filename = filename
        self.engine = SearchEngine()
        # self.engine2 = SearchEngine()

    @timed
    def init_engine(self, isFromFile=True, isBinaryWord=False):
        """Load words from a file name.

        This method leverages the iterable behavior of File objects,
        which automatically use buffered IO and memory management,
        handling large files effectively.
        """
        # print isFromFile
        if isFromFile:
            self.loadFromeFile(isBinaryWord)
        else:
            logger.info('Loading words from file...')
            iid = 1
            for parent, dirnames, fnames in os.walk(self.filename):
                for fname in fnames:
                    fname2 = './Reuters/' + fname
                    # print fname
                    word = open(fname2).read()
                    # temp = fname.rstrip('.html').split('-')
                    # if len(temp) <= 1:
                    #     continue
                    # singer = temp[0]
                    # title = temp[1]
                    # metadata = singer + ' ' + title
                    # wordobject = Word(iid, title, singer, word)
                    wordobject = Word(iid, word, isBinaryWord)
                    # songobject = SongInfo(iid, title, singer, metadata)
                    self.engine.add_object(wordobject)
                    # self.engine2.add_object(songobject)
                    iid += 1
            self.engine.start(isBinaryWord)
            # self.engine2.start()
            self.saveToFile(isBinaryWord)

    @timed
    def search_words(self, query, n_results=10, choice=2, SYSNONYM=False):
        """Search words according to the provided query of terms.

        The query is executed against the indexed words, and a list of words
        compatible with the provided terms is returned along with their
        tf-idf scores.

        Args:
            query (str): Query string with one or more terms.
            n_results (int): Desired number of results.

        Returns:
            list of IndexableResult: List containing words and their
            respective tf-idf scores.
        """
        result = ''
        # dictionary = self.engine.index.term_index.keys()
        if len(query) > 0:
            # checkSpelling(query, dictionary)
            parent, dirnames, fnames = list(os.walk(self.filename))[0]
            if choice == 1:
                result = self.engine.search_bool(query, n_results, SYSNONYM)
                for res in result:
                    print res, " ", fnames[res]
            elif choice == 2:
                result = self.engine.search(query, n_results, SYSNONYM)
                for res in result:
                    print res.indexable.iid - 1, " ", fnames[res.indexable.iid - 1], " ", res.score
            # print len(list(os.walk(self.filename)))
            # print
        if len(result) > 0:
            # return '\n'.join([str(indexable) for indexable in result])
            return
        return self._NO_RESULTS_MESSAGE

    # def search_info(self, query, n_results=10):
    #     """Search song information according to provided query of terms.
    #     The query is executed against the indexed words, and a list of words
    #     compatible with the provided terms is returned along with their
    #     tf-idf score.
    #     Args:
    #         query (str): Query string with one or more terms.
    #         n_results (int): Desired number of results.
    #     Returns:
    #         list of IndexableResult: List containing words and their
    #         respective tf-idf scores.
    #     """
    #     result = ''
    #     if len(query) > 0:
    #         result = self.engine2.search(query, n_results)
    #     if len(result) > 0:
    #         return '\n'.join([str(indexable) for indexable in result])
    #     return self._NO_RESULTS_MESSAGE

    def saveToFile(self, isBinaryWord):
        if isBinaryWord:
            fileObject = open('test.engine', 'w')
        else:
            fileObject = open('test_noBinary.engine', 'w')
        pickle.dump(self.engine, fileObject)

    # @timed
    def loadFromeFile(self, isBinaryWord=False):
        # print isBinaryWord
        if isBinaryWord:
            fileObject = open('test.engine', 'r')
        else:
            fileObject = open('test_noBinary.engine', 'r')
        self.engine = pickle.load(fileObject)

    def words_count(self):
        """
        Returns:
            int: Number of words indexed.
        """
        return self.engine.count()
class FrontierManager():
    """ Frontier Manager
    seeds in request form
    """
    # seeds to start crawling
    seeds = []
    # links to crawl
    links = []
    # links crawled
    links_done = []
    # /! Will have to go in a Frontera Middleware at some point
    # weighted links to crawl
    weighted_links = None
    # weighted links crawled
    weighted_links_done = []
    # weighted
    ignore_seeds = []
    # weighted
    ignored_pages = []
    requests = []
    requests_done = []
    max_n_requests = 10
    searchengine = None
    crawl_book = None
    url_base = ''

    # /! def __init__(self, settings=SETTINGS, seeds=SETTINGS['SEEDS']):
    def __init__(self, seeds=[]):
        """ init with seeds
        Create/Open a file for storing progress
        """
        # self.settings = settings
        url = next(iter(SEEDS))
        parsed_uri = urlparse(url)
        self.url_base = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
        self.searchengine = SearchEngine()
        self.searchengine.db_connect()
        self.crawl_book = CrawlWorkbook(path=WWW_DIR, url=seeds[0].url)
        self.crawl_book.wb_open()
        # /! Will have to go in a Frontera Middleware at some point
        # retrieve weighted_links, weighted_links_done...
        self.weighted_links = self.crawl_book.weighted_links
        self.weighted_links_done = self.crawl_book.weighted_links_done
        self.ignore_seeds = self.crawl_book.ignore_seeds
        self.ignored_pages = self.crawl_book.ignored_pages
        self.add_seeds(seeds)
        # build requests from weighted_links
        for wl in self.weighted_links:
            self.requests.append(requests.Request(url=wl.url))
        for wl in self.weighted_links_done:
            self.requests_done.append(requests.Request(url=wl.url))
        # ignore list
        ignore_suffixes = [
            '/es/', '/fr/', '/ca/', '/newsletters', '/2021/',
            '/2020/01/', '/2020/02/', '/2020/03/', '/2020/04/', '/2020/05/',
            '/2020/06/', '/2020/07/', '/2020/08/', '/2020/09/', '/2020/10/',
            '/2019/', '/2018/', '/2017/', '/2016/', '/2015/', '/2014/',
            '/section/world', '/video/world', '/section/food', '/section/arts',
            '/section/sports', '/section/science', '/section/books',
            '/section/travel', '/section/realestate', '/section/fashion',
            '/section/technology', '/section/politics', '/section/business',
            '/section/style', '/section/well', '/section/style/love',
            '/section/us', '/section/video', '/section/interactive',
            '/section/magazine', '/international', '/section/t-magazine',
            '/section/live', '/live', '/video', '/interactive', '/issue/fashion',
            '/subscription', '/subscriptions', '/section/business/dealbook',
            '/pages/business/dealbook', '/privacy'
        ]
        if not self.ignore_seeds:
            self.ignore_seeds = [
                WeightedLink(url=urljoin(self.url_base, suffix))
                for suffix in ignore_suffixes
            ]
            self.crawl_book.ws_writerows(
                WORKBOOK['crawler']['worksheet']['ignoreseeds']['TITLE'],
                self.ignore_seeds)

    def add_seeds(self, seeds):
        """ add seeds /! not append """
        self.seeds = seeds
        if self.weighted_links is None:
            self.weighted_links = [
                WeightedLink(url=seed.url) for seed in self.seeds
            ]
        if self.weighted_links is not None and len(self.weighted_links) == 0:
            self.weighted_links = [
                WeightedLink(url=seed.url) for seed in self.seeds
            ]

    def request_error(request, error_code):
        """ TODO """
        pass

    def start(self):
        # should open workbook as well
        self.searchengine.db_connect()

    def stop(self):
        # should save workbook, maybe init values with it as well
        self.searchengine.db_close()

    def finished(self):
        """ Quick check if crawling is finished.
        Called pretty often, please make sure calls are lightweight.
        """
        return not self.weighted_links

    def page_save_to_file(self, request, soup):
        """ save page to a #.html file where # is the url crc32
        TODO: save request
        """
        # eg. /var/www/html/apple.com/256.html
        url_hash = SearchEngine.hash_url(request.url)
        file_name = os.path.join(HTML_DIR, urlsplit(request.url).netloc,
                                 str(url_hash) + '.html')
        print(file_name)
        os.makedirs(os.path.join(HTML_DIR, urlsplit(request.url).netloc),
                    exist_ok=True)
        f = open(file_name, 'w')
        f.write(soup.prettify())
        f.close()

    def page_crawled(self, response):
        """ This method is called every time a page has been crawled. """
        self.requests_done.append(response.request)
        self.requests = [
            req for req in self.requests if req.url != response.request.url
        ]
        html_doc = response.text
        soup = BeautifulSoup(html_doc, 'html.parser')
        title = u''
        if soup is not None:
            titles = soup.find('title')
            if titles is not None:
                title = titles.string
        # extract first weighted link matching response.request.url
        wl = next(
            (x for x in self.weighted_links if x.url == response.request.url),
            None)
        if wl:
            self.crawl_book.ws_writeln(
                WORKBOOK['crawler']['worksheet']['crawledpages']['TITLE'], wl)
            self.crawl_book.wb_save()
        # update weighted_links from requests
        self.weighted_links = [
            wl for wl in self.weighted_links if wl.url != response.request.url
        ]
        self.crawl_book.ws_writerows(
            WORKBOOK['crawler']['worksheet']['tocrawlpages']['TITLE'],
            self.weighted_links)
        self.page_save_to_file(request=response.request, soup=soup)
        print('Frontier: ', len(self.requests), 'pages to crawl -',
              len(self.requests_done), 'crawled pages -',
              len(self.ignored_pages), 'ignored pages')

    def get_next_requests(self, max_n_requests=MAX_N_REQUESTS):
        """ Returns a list of next urls to be crawled.
        Parameters: max_next_requests (int) - Maximum number of urls to be
        returned by this method.
        Returns: list of weighted links.
        """
        # return first max_n_requests links
        return self.requests[:max_n_requests]
        # return self.weighted_links[:max_n_requests]

    def in_ignore_seeds(self, link):
        """ returns True if link (request) is in self.ignore_seeds """
        return next(
            (x for x in self.ignore_seeds if link.url.startswith(x.url)), None)

    def in_ignored_pages(self, link):
        """ returns True if link (request) is in self.ignored_pages """
        return next((x for x in self.ignored_pages if x.url == link.url), None)

    def links_extracted(self, request, links):
        """ add links to crawl found in response (from request) """
        print('Frontier: links_extracted')
        for req in links:
            already_there = False
            if self.in_ignore_seeds(req):
                if not self.in_ignored_pages(req):
                    self.ignored_pages.append(WeightedLink(url=req.url))
            else:
                # extract first request matching req.url
                inreqs = next((x for x in self.requests if x.url == req.url),
                              None)
                if not inreqs:
                    # extract first crawled request matching req.url
                    inreqsdone = next(
                        (x for x in self.requests_done if x.url == req.url),
                        None)
                    if not inreqsdone:
                        self.requests.append(req)
                        self.weighted_links.append(WeightedLink(url=req.url))
        wbwsname = WORKBOOK['crawler']['worksheet']['tocrawlpages']['TITLE']
        self.crawl_book.ws_writerows(wbwsname, self.weighted_links)
        wbwsname = WORKBOOK['crawler']['worksheet']['ignoredpages']['TITLE']
        self.crawl_book.ws_writerows(wbwsname, self.ignored_pages)
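A minimal sketch of how a crawl loop might drive this frontier, assuming the module-level config (SEEDS, etc.) is available and that pages are fetched with requests and parsed with BeautifulSoup; the seed construction and fetching details are assumptions, not the original driver:

# Hypothetical crawl loop around the FrontierManager API used above.
frontier = FrontierManager(seeds=[requests.Request(url=next(iter(SEEDS)))])
frontier.start()
while not frontier.finished():
    batch = frontier.get_next_requests(max_n_requests=10)
    if not batch:
        break
    for req in batch:
        response = requests.get(req.url)   # fetch the page
        response.request = req             # keep the originating request object
        frontier.page_crawled(response)
        soup = BeautifulSoup(response.text, 'html.parser')
        found = [requests.Request(url=a['href'])   # absolute links assumed
                 for a in soup.find_all('a', href=True)]
        frontier.links_extracted(req, found)
frontier.stop()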
from flask import Flask, render_template, request
from flask_bootstrap import Bootstrap
from data_source import DataSource
from search import SearchEngine
import urllib.parse

app = Flask(__name__)
bootstrap = Bootstrap(app)
ds = DataSource()
se = SearchEngine(ds)


@app.route('/', methods=['GET'])
def index():
    return render_template('index_jinja.html')


@app.route('/search', methods=['GET'])
def search():
    # get query
    query = request.args.get('query')
    if query is None:
        query = ''
    query = urllib.parse.unquote(query)
    # search result
    search_result = se.search(query)
from flask import Flask, send_from_directory
from recipe import Recipe, load_recipe_from_json
from pathlib import Path
from recipe_index import RecipeIndex
from search import SearchEngine
from suggestions import SuggestionEngine
import traceback

app = Flask(__name__, static_url_path=None)

# TODO How can we make constants configurable?
REACT_BUILD_DIR = 'react-js/build'
DEFAULT_IMAGE_DIR = f'{str(Path.home())}/data/recipe-data/images'
DEFAULT_CONFIG_DIR = f'{str(Path.home())}/data/recipe-data/config'

INDEX = RecipeIndex(DEFAULT_CONFIG_DIR)
SEARCHER = SearchEngine(INDEX)
SUGGESTER = SuggestionEngine(INDEX)


@app.route('/', defaults={'path': 'index.html'})
@app.route('/<path:path>')
def catch_all(path):
    return send_from_directory(REACT_BUILD_DIR, path)


@app.route('/recipe/<id>')
def render_recipe(id):
    return send_from_directory(REACT_BUILD_DIR, 'index.html')


@app.route('/images/<image_name>')
def run(self):
    self.m_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    self.m_socket.bind((self.m_ip, self.m_port))
    while True:
        self.m_socket.listen(3)
        connect, address = self.m_socket.accept()
        data = connect.recv(1024)
        request_lists = self.handle_http_request(data)
        self.handle_http_request(data)
        source = '/index.html'
        if request_lists[0][1] != '/':
            source = request_lists[0][1]
            if '.html' in source:
                self.content += 'Content-Type: text/html\r\n\r\n'
            elif '.css' in source:
                self.content += 'Content-Type: text/css\r\n\r\n'
            elif '.js' in source:
                self.content += 'Content-Type: text/js\r\n\r\n'
            elif '.jpg' in source:
                self.content += 'Content-Type: image/jpg\r\n\r\n'
            elif '.png' in source:
                self.content += 'Content-Type: image/png\r\n\r\n'
            else:
                self.content += 'Content-Type: text/html\r\n\r\n'
        else:
            self.content += 'Content-Type: text/html\r\n\r\n'
            source = '/index.html'
        print source
        string = ""
        if "?" in source:
            key_words = source[4:]
            print key_words
            key_words = key_words.replace("%", ' ')
            key_words = key_words.split(" ")
            key_words = key_words[1:]
            print key_words
            k = "".join(map(lambda x: chr(int(x, 16)), key_words))
            print k
            a = SearchEngine('F:\search_engine\json_data\\').run(k)
            url = "http://job.cqupt.edu.cn/#rec:"
            print a
            for i in a:
                string += "<p>" + "<a href=" + url + i[0][:3] + ">" + i[0][:3] + "</a>" + "</p>"
            string = '<html><title>result</title>' + string + "</html>"
            source = '/index.html'
        try:
            print os.getcwd() + '/www' + source
            fp = open(os.getcwd() + '/www' + source, 'r')
            f = fp.read()
            fp.close()
            if len(string) > 1:
                f = string
            connect.sendall(self.content + f)
        except:
            print "not found"
            fp = open(os.getcwd() + '/www' + '/404.html', 'r')
            f = fp.read()
            fp.close()
            if len(string) > 1:
                f = string
            connect.sendall(self.content + f)
        self.content = 'HTTP/1.x 200 OK\r\n'
        connect.close()
from django.shortcuts import render_to_response
from django.http import HttpResponse
from search import SearchEngine
import json

# Create your views here.
g_Se = SearchEngine()


def home(request):
    return render_to_response("index.html")


def search(request):
    keyword = request.GET.get('keyword', '')
    tweets = g_Se.search(keyword)
    return HttpResponse(json.dumps({"keyword": keyword, "tweets": tweets}))


def search_range(request):
    lat = request.GET.get('lat', '')
    lon = request.GET.get('lon', '')
    tweets = g_Se.search_range(float(lat), float(lon))
    return HttpResponse(json.dumps({"tweets": tweets}))
class wordInventory(object):
    """Class representing an inventory of words.

    Args:
        filename (str): File name containing word inventory data.

    Attributes:
        filename (str): File name containing word inventory data.
        engine (SearchEngine): Object responsible for indexing word inventory data.
    """
    _NO_RESULTS_MESSAGE = 'Sorry, no results.'

    def __init__(self, filename):
        self.filename = filename
        self.engine = SearchEngine()
        self.engine2 = SearchEngine()

    @timed
    def load_words(self):
        """Load words from a file name.

        This method leverages the iterable behavior of File objects,
        which automatically use buffered IO and memory management,
        handling large files effectively.
        """
        logger.info('Loading words from file...')
        iid = 1
        for parent, dirnames, fnames in os.walk(self.filename):
            for fname in fnames:
                fname2 = './Reuters/' + fname
                # print fname
                word = open(fname2).read()
                # temp = fname.rstrip('.html').split('-')
                # if len(temp) <= 1:
                #     continue
                # singer = temp[0]
                # title = temp[1]
                # metadata = singer + ' ' + title
                # wordobject = Word(iid, title, singer, word)
                wordobject = Word(iid, word)
                # songobject = SongInfo(iid, title, singer, metadata)
                self.engine.add_object(wordobject)
                # self.engine2.add_object(songobject)
                iid += 1
        self.engine.start()
        # self.engine2.start()
        self.saveToFile()

    @timed
    def search_words(self, query, n_results=10):
        """Search words according to the provided query of terms.

        The query is executed against the indexed words, and a list of words
        compatible with the provided terms is returned along with their
        tf-idf scores.

        Args:
            query (str): Query string with one or more terms.
            n_results (int): Desired number of results.

        Returns:
            list of IndexableResult: List containing words and their
            respective tf-idf scores.
        """
        result = ''
        # dictionary = self.engine.index.term_index.keys()
        if len(query) > 0:
            # checkSpelling(query, dictionary)
            result = self.engine.search(query, n_results)
            print result
        if len(result) > 0:
            # return '\n'.join([str(indexable) for indexable in result])
            return
        return self._NO_RESULTS_MESSAGE

    # def search_info(self, query, n_results=10):
    #     """Search song information according to provided query of terms.
    #     The query is executed against the indexed words, and a list of words
    #     compatible with the provided terms is returned along with their
    #     tf-idf score.
    #     Args:
    #         query (str): Query string with one or more terms.
    #         n_results (int): Desired number of results.
    #     Returns:
    #         list of IndexableResult: List containing words and their
    #         respective tf-idf scores.
    #     """
    #     result = ''
    #     if len(query) > 0:
    #         result = self.engine2.search(query, n_results)
    #     if len(result) > 0:
    #         return '\n'.join([str(indexable) for indexable in result])
    #     return self._NO_RESULTS_MESSAGE

    def saveToFile(self):
        fileObject = open('test.engine', 'w')
        pickle.dump(self.engine, fileObject)

    def words_count(self):
        """
        Returns:
            int: Number of words indexed.
        """
        return self.engine.count()
def __init__(self, filename):
    self.filename = filename
    self.engine = SearchEngine()
    self.engine2 = SearchEngine()
class SearchEngineTests(unittest.TestCase):
    """
    Test case for SearchEngine class.
    """

    def setUp(self):
        """
        Setup search engine that will be subjected to the tests.
        """
        self.engine = SearchEngine()

    def test_indexed_doc_count(self):
        """
        Test if the number of indexed objects is retrieved correctly.
        """
        sample1 = Indexable(1, 'this is an indexable metadata')
        sample2 = Indexable(2, 'this is an indexable super metadata')
        sample3 = Indexable(3, 'this is another indexable metadata')
        self.build_sample_index([sample1, sample2, sample3])
        self.assertEqual(self.engine.count(), 3)

    def test_existent_term_search(self):
        """
        Test if search is correctly performed.
        """
        sample1 = Indexable(1, 'this is an indexable metadata')
        sample2 = Indexable(2, 'this is an indexable super metadata')
        sample3 = Indexable(3, 'this is another indexable metadata')
        self.build_sample_index([sample1, sample2, sample3])
        expected_results = [
            IndexableResult(1.414214, sample1),
            IndexableResult(0.906589, sample2),
            IndexableResult(0.906589, sample3),
        ]
        results = self.engine.search('indexable metadata')
        self.assertListEqual(results, expected_results)

    def test_non_existent_term_search(self):
        """
        Test if search is correctly performed.
        """
        sample1 = Indexable(1, 'this is an indexable metadata')
        sample2 = Indexable(2, 'this is an indexable super metadata')
        sample3 = Indexable(3, 'this is another indexable metadata')
        self.build_sample_index([sample1, sample2, sample3])
        expected_results = []
        results = self.engine.search('asdasdasdas')
        self.assertListEqual(results, expected_results)

    def test_search_result_limit(self):
        """
        Test if search results can be limited.
        """
        sample1 = Indexable(1, 'this is an indexable metadata')
        sample2 = Indexable(2, 'this is an indexable super metadata')
        sample3 = Indexable(3, 'this is another indexable metadata')
        self.build_sample_index([sample1, sample2, sample3])
        expected_results = [
            IndexableResult(1.414214, sample1),
        ]
        results = self.engine.search('indexable metadata', 1)
        self.assertListEqual(results, expected_results)

    def build_sample_index(self, objects):
        for indexable in objects:
            self.engine.add_object(indexable)
        self.engine.start()