def setUp(self):
    self.engine = SearchEngine('db_name')
    self.engine.database.update(database)
    with open("test1.txt", 'w') as file:
        file.write(test1)
    with open("test2.txt", 'w') as file:
        file.write(test2)
def setUp(self):
    # The position rulers below each fixture string mark character offsets.
    self.strr = 'sun window tree apple, juse border films 23 45good'
    #            01234567890123456789012345678901234567890123456789
    #            0         1         2         3         4
    # Russian: "We are testing our program for working with windows."
    self.strr2 = 'Мы тестируем нашу программу для работы с окнами. '
    #             01234567890123456789012345678901234567890123456789
    #             0         1         2         3         4
    # Russian: "First line for tests."
    self.strr3 = 'Первая строка для тестов.\n'
    #             0123456789012345678901234
    #             0         1         2
    # Russian: "Second line for tests."
    self.strr4 = 'Вторая строка для тестов.'
    #             0123456789012345678901234
    #             0         1         2
    with open('test_window_one.txt', 'w') as test_file:
        test_file.write(self.strr)
    with open('test_window_two.txt', 'w') as test_file:
        test_file.write(self.strr2)
    with open('test_window_three.txt', 'w') as test_file:
        test_file.write(self.strr3)
        test_file.write(self.strr4)
    self.x = SearchEngine("test_db")
    self.x.database.update(idealdict)
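# The setUp above creates three files on disk, but no tearDown is shown. A
# minimal sketch of one, assuming only the file names used above (the
# "test_db" database may need its own cleanup in the real suite):
import os

def tearDown(self):
    for name in ('test_window_one.txt', 'test_window_two.txt',
                 'test_window_three.txt'):
        if os.path.exists(name):
            os.remove(name)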
def cli(method):
    """
    A search engine that, given a query, searches tweets related to the
    2020 USA elections.
    """
    se = SearchEngine(method)
    return 0
def main():
    win = SearchEngine('database')
    while True:
        findstr = input("Word to search for: ")
        if findstr == "exit":
            break
        res = win.find_supplemented_window(findstr, 2)  # 2 is the window depth
        for k in res:  # k is a key
            print(k)
            for v in res[k]:  # v is a value from the list
                print(v)
                print(v.get_BB_string())
def __init__(self, data_dir, feature_type, survey):
    data_dir = os.path.normpath(data_dir)
    data_name = os.path.basename(data_dir)

    # Search Engine
    lex_file = os.path.join(data_dir, data_name + '.lex')
    background_file = os.path.join(data_dir, data_name + '.background')
    inv_index_file = os.path.join(data_dir, data_name + '.index')
    doclengs_file = os.path.join(data_dir, data_name + '.doclength')
    self.searchengine = SearchEngine(
        lex_file=lex_file,
        background_file=background_file,
        inv_index_file=inv_index_file,
        doclengs_file=doclengs_file
    )

    # State Machine
    self.statemachine = StateMachine(
        background=self.searchengine.background,
        inv_index=self.searchengine.inv_index,
        doclengs=self.searchengine.doclengs,
        data_dir=data_dir,
        feat=feature_type
    )

    # Action Manager
    self.actionmanager = ActionManager(
        background=self.searchengine.background,
        doclengs=self.searchengine.doclengs,
        data_dir=data_dir,
        survey=survey
    )

    # Training or Testing
    self.test_flag = True
import os
import sys

from flask import request
from flask_api import FlaskAPI

sys.path.append(os.path.abspath("../indexer/src"))
from searchengine import SearchEngine
from tokenstore import TokenStore

app = FlaskAPI(__name__)


@app.route("/")
def index():
    return app.send_static_file('html/index.html')


@app.route("/api/search", methods=["POST"])
def search() -> list:
    """
    curl -X POST http://127.0.0.1:5000/api/search -d queries="hello world"
    """
    return search_engine.search(request.data.get("queries").lower().split(","))


if __name__ == "__main__":
    store = TokenStore()
    search_engine = SearchEngine(store)
    app.run(debug=True, host="0.0.0.0")
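# The docstring above gives a curl example; the equivalent request from
# Python, using the third-party `requests` package (an assumption, not a
# dependency of this file), would be a sketch like:
import requests

resp = requests.post(
    "http://127.0.0.1:5000/api/search",
    data={"queries": "hello,world"},  # comma-separated, matching split(",") above
)
print(resp.json())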
regex = args.regex
method_search_state = MethodSearch.all
method_sorting = SortMethod.abc
sortOrder = SortOrder.asc
stat_method_state = MethodStat.count

if args.count_mathces_arg and args.unique_mathces_arg:
    method_search_state = MethodSearch.unique_count
else:
    if args.unique_mathces_arg:
        method_search_state = MethodSearch.unique
    if args.count_mathces_arg:
        method_search_state = MethodSearch.count
if args.count_line_mathces_arg:
    method_search_state = MethodSearch.line

if args.sort_method_arg:
    method_sorting = sort_method_map[args.sort_method_arg]
if args.sort_order_arg:
    sortOrder = sort_order_map[args.sort_order_arg]
sorter = Sorter(method_sorting, sortOrder)

if args.stat_arg:
    stat_engine = StatEngine(regex, stat_method_map[args.stat_arg], sorter)
    print(stat_engine.begin(data_strings, args.num_print_rows_arg))
else:
    search_engine = SearchEngine(regex, method_search_state, sorter)
    print(search_engine.begin(data_strings, args.num_print_rows_arg))
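# The block above reads flags such as args.count_mathces_arg from a parser
# that is not shown. A hedged sketch of an argparse setup that would yield
# those attribute names (the dest spellings, including "mathces", follow the
# usage above; the option strings themselves are assumptions):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('regex')
parser.add_argument('-u', dest='unique_mathces_arg', action='store_true')
parser.add_argument('-c', dest='count_mathces_arg', action='store_true')
parser.add_argument('-l', dest='count_line_mathces_arg', action='store_true')
parser.add_argument('--sort-method', dest='sort_method_arg')
parser.add_argument('--sort-order', dest='sort_order_arg')
parser.add_argument('--stat', dest='stat_arg')
parser.add_argument('-n', dest='num_print_rows_arg', type=int, default=10)
args = parser.parse_args()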
for x, y in opts:
    if x == '-d':
        dictionary_file = y
    elif x == '-p':
        postings_file = y
    elif x == '-q':
        query_file = y
    elif x == '-o':
        results_file = y
    else:
        raise AssertionError('unhandled option')

if dictionary_file is None or postings_file is None or query_file is None or results_file is None:
    print(f'usage: {sys.argv[0]} -d dictionary-file -p postings-file -q file-of-queries -o output-file-of-results')
    sys.exit(2)

document_file = 'document.txt'
dictionary = load_dictionary(dictionary_file)
documents = load_documents(document_file)
search_engine = SearchEngine(dictionary, documents, postings_file)
query, relevant_doc_ids = read_query(query_file)

with open(results_file, 'w') as f:
    try:
        result = search_engine.search(query, relevant_doc_ids)
        f.write(' '.join([str(i) for i in result]) + '\n')
    except ParseError as e:
        f.write(f'parse error encountered: {e}')
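# `opts` is consumed above but never built in this excerpt. A minimal sketch
# of the missing preamble, assuming the standard getopt pattern implied by
# the four single-letter options:
import getopt
import sys

dictionary_file = postings_file = query_file = results_file = None
try:
    opts, _ = getopt.getopt(sys.argv[1:], 'd:p:q:o:')
except getopt.GetoptError:
    print(f'usage: {sys.argv[0]} -d dictionary-file -p postings-file '
          f'-q file-of-queries -o output-file-of-results')
    sys.exit(2)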
def do_POST(self):
    """
    POST handler for a query.
    """
    try:
        content_length = int(self.headers['Content-Length'])
        body = str(self.rfile.read(content_length))
        print("body = " + body)
        query, limit, offset, limits, offsets, action, action_doc, action_exists = self.parse_url(body)
        print("query = " + query)
        print("doclimit = " + limit)
        print("docoffset = " + offset)
        print("action = " + action)
        print("actiondoc = " + action_doc)
        if action_exists:
            offsets = self.get_new_offset_limit(action, action_doc, offsets, limits)
        print('limits = ' + str(limits))
        print('offsets = ' + str(offsets))
        search_engine = SearchEngine('database')
        r = search_engine.search_limit_offset(query, 4, limit, offset, limits, offsets)
        myresp = ''
        myresp += 'Documents Limit<br><input type="text" name="limit" value="' + str(limit) + '"><br>'
        myresp += 'Documents Offset<br><input type="text" name="offset" value="' + str(offset) + '"><br>'
        key_list = sorted(r.keys())
        j = 0
        for key in key_list:
            myresp += '<ol>\n'
            myresp += '<li>' + key + '</li>\n<ul>'
            myresp += 'Limit<br><input type="text" name="doc' + str(j) + 'limit" value="' + limits[j] + '"><br>'
            myresp += 'Offset<br><input type="text" name="doc' + str(j) + 'offset" value="' + offsets[j] + '"><br>'
            myresp += '<input type="submit" name="action' + str(j) + '" value="prev">'
            myresp += '<input type="submit" name="action' + str(j) + '" value="back">'
            myresp += '<input type="submit" name="action' + str(j) + '" value="next"> <br>'
            for val in r[key]:
                myresp += '<li>' + val + '</li>'
            myresp += '</ul>'
            j += 1
            myresp += '</ol>'
        self.send_response(200)
        self.send_header("Content-type", "text/html; charset=utf-8")
        self.end_headers()
        self.wfile.write(bytes((resp + data.format(query, myresp)), "utf-8"))
    except TypeError:
        response = 'the "limit" and "offset" fields cannot take negative or fractional values'
        self.send_response(200)
        self.send_header("Content-type", "text/html; charset=utf-8")
        self.end_headers()
        self.wfile.write(bytes((resp + data.format('', response)), "utf-8"))
    except Exception as ex:
        response = '<br>Oops, something went wrong. Error message: ' + str(ex) + '<br>'
        self.send_response(200)
        self.send_header("Content-type", "text/html; charset=utf-8")
        self.end_headers()
        files = os.listdir(".\\")
        i = 0
        response += 'Documents Limit<br><input type="text" name="limit" value="0"><br>'
        response += 'Documents Offset<br><input type="text" name="offset" value="0"><br>'
        for f in files:
            if re.match(r".*\.txt", f):
                response += (f + "<br>")
                response += 'Limit<br><input type="text" name="doc' + str(i) + 'limit" value="0"><br>'
                response += 'Offset<br><input type="text" name="doc' + str(i) + 'offset" value="0"><br>'
                response += '<input type="submit" name="action' + str(i) + '" value="prev">'
                response += '<input type="submit" name="action' + str(i) + '" value="back">'
                response += '<input type="submit" name="action' + str(i) + '" value="next"> <br>'
                i += 1
        self.wfile.write(bytes((resp + data.format('', 'Not Found<br>' + response)), "utf-8"))
def setUp(self):
    self.x = SearchEngine("database")
    self.x.database.update(idealdict)
corpus = p.get_corpus(
    'D:/9th semester/Information Retrieval Lab/package/scrapper/data/corpus1.pkl'
) + p.get_corpus(
    'D:/9th semester/Information Retrieval Lab/package/scrapper/data/corpus2.pkl'
) + p.get_corpus(
    'D:/9th semester/Information Retrieval Lab/package/scrapper/data/corpus3.pkl'
) + p.get_corpus(
    'D:/9th semester/Information Retrieval Lab/package/scrapper/data/corpus4.pkl'
) + p.get_corpus(
    'D:/9th semester/Information Retrieval Lab/package/scrapper/data/corpus5.pkl'
)

cat_data = pd.DataFrame(
    columns=['id', 'tokens', 'category', 'category_code', 'pred_category'])
cat_data = t.get_categorised_data(
    'D:/9th semester/Information Retrieval Lab/package/scrapper/data/data_categorisation1_final.pkl',
    cat_data)
cat_data = t.get_categorised_data(
    'D:/9th semester/Information Retrieval Lab/package/scrapper/data/data_categorisation2_final.pkl',
    cat_data)
cat_data = t.get_categorised_data(
    'D:/9th semester/Information Retrieval Lab/package/scrapper/data/data_categorisation3_final.pkl',
    cat_data)
cat_data = t.get_categorised_data(
    'D:/9th semester/Information Retrieval Lab/package/scrapper/data/data_categorisation4_final.pkl',
    cat_data)
cat_data = t.get_categorised_data(
    'D:/9th semester/Information Retrieval Lab/package/scrapper/data/data_categorisation5_final.pkl',
    cat_data)

engine = SearchEngine(model, corpus, cat_data)
print('Done!')
application.run()
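# The five corpus paths and five categorisation paths above differ only in a
# numeric suffix; an equivalent, hedged rewrite using loops (DATA_DIR is an
# assumed name for the shared prefix, and cat_data is the same empty
# DataFrame initialised above):
DATA_DIR = 'D:/9th semester/Information Retrieval Lab/package/scrapper/data'

corpus = p.get_corpus(f'{DATA_DIR}/corpus1.pkl')
for i in range(2, 6):
    corpus = corpus + p.get_corpus(f'{DATA_DIR}/corpus{i}.pkl')

for i in range(1, 6):
    cat_data = t.get_categorised_data(
        f'{DATA_DIR}/data_categorisation{i}_final.pkl', cat_data)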
def setUp(self):
    self.x = SearchEngine()
        disabled_f_doc = ''
        if limit_doc >= len(res[k]):
            disabled_f_doc = 'disabled'
        disabled_b_doc = ''
        if offset_doc == 0:
            disabled_b_doc = 'disabled'
        for j, v in enumerate(res[k]):
            if j == limit_doc:
                break
            # `re` is a string accumulator here; it shadows the re module.
            re += '<li>' + v.get_BB_string() + '</li>'
        re += '</ul>'
        result += form_limit.format(k, str(i), str(offset_doc), str(limit_doc),
                                    re, disabled_f_doc, disabled_b_doc)
    ret_result = body.format(postvars['findstr'][0], str(offset), str(limit),
                             result, disabled_f, disabled_b)
    return ret_result


if __name__ == '__main__':
    httpd = HTTPServer(('localhost', 80), custom_handler)
    httpd.search_engine = SearchEngine('database')
    print('Start server.')
    httpd.serve_forever()
import json
from io import BytesIO

from flask import Flask, request
from flask import jsonify
from flask_cors import CORS
from transformers import BertTokenizer, BertModel

from searchengine import SearchEngine
from retriever import Retriever
from ranker import BertRanker, Word2vecRanker

app = Flask(__name__)
cors = CORS(app, resources={r'/get': {"origins": "*"}})

se = SearchEngine()
with open('../cache/word_to_pos_count2.json') as f:
    word_to_pos = json.load(f)
with open('../cache/pos_hanzi.json') as f:
    pos_to_hanzi = json.load(f)
hanzi_to_pos = {hanzi: pos for pos, hanzi in pos_to_hanzi.items()}
hanzi_to_pos['不限'] = 'all'  # "不限" ("no restriction") maps to the catch-all tag
print('finish load...')

# Chinese part-of-speech labels: noun, person name, place name, organization
# name, other proper noun, numeral, measure word, numeral-measure compound,
# time word, locative, place word, verb, adjective, adverb, prefix, suffix,
# idiom, abbreviation, pronoun, conjunction, preposition, particle, modal
# particle, interjection, onomatopoeia, morpheme, punctuation, other.
initialPos = ['名词', '人名', '地名', '机构名', '其它专名', '数词', '量词', '数量词',
              '时间词', '方位词', '处所词', '动词', '形容词', '副词', '前接成分',
              '后接成分', '习语', '简称', '代词', '连词', '介词', '助词', '语气助词',
              '叹词', '拟声词', '语素', '标点', '其它']

bert = BertModel.from_pretrained("hfl/chinese-bert-wwm-ext")
tokenizer = BertTokenizer.from_pretrained("hfl/chinese-bert-wwm-ext")
bertranker = BertRanker()
word2vecranker = Word2vecRanker("/data/disk2/private/hujinyi/IRHomework/cache/sgns.renmin.word")


@app.route('/')
def hello_world():
    return 'Hello World!'
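# CORS above is configured for a '/get' resource that this excerpt does not
# define. A heavily hedged sketch of what such a route might look like; the
# se.search call and its parameters are assumptions, not the real API:
@app.route('/get')
def get():
    query = request.args.get('query', '')
    pos = hanzi_to_pos.get(request.args.get('pos', '不限'), 'all')
    results = se.search(query, pos)  # hypothetical signature
    return jsonify(results)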