Exemplo n.º 1
0
 def setUp(self):
     """Create the engine under test and write the two fixture text files.

     ``self.engine.database`` is updated with the module-level ``database``
     object, then ``test1.txt`` and ``test2.txt`` are (re)written in the
     current working directory with the ``test1`` and ``test2`` texts.
     """
     engine = SearchEngine('db_name')
     self.engine = engine
     engine.database.update(database)

     fixtures = (("test1.txt", test1), ("test2.txt", test2))
     for path, text in fixtures:
         with open(path, 'w') as handle:
             handle.write(text)
Exemplo n.º 2
0
    def setUp(self):
        """Write the three fixture files and build the engine under test.

        Stores the fixture texts on ``self.strr`` .. ``self.strr4``, writes
        them to ``test_window_one.txt``, ``test_window_two.txt`` and
        ``test_window_three.txt`` (the third file holds ``strr3`` followed by
        ``strr4``), then creates ``self.x`` and updates its ``database`` with
        the module-level ``idealdict``.

        The files are written inside ``with`` blocks so that a handle is
        closed even if a write fails. As before, ``self.test_file`` ends up
        bound to the last (already closed) file object.
        """
        # The rulers under each string give the character index of every
        # position (units digit on the first line, tens digit on the second).
        self.strr = 'sun window tree apple, juse border films 23 45good'
        #            01234567890123456789012345678901234567890123456789
        #            0         1         2         3         4
        self.strr2 = 'Мы тестируем нашу программу для работы с окнами. '
        #             01234567890123456789012345678901234567890123456789
        #             0         1         2         3         4
        self.strr3 = 'Первая строка для тестов.\n'
        #             01234567890123456789012345678901234567890123456789
        #             0         1         2         3         4

        self.strr4 = 'Вторая строка для тестов.'
        #             01234567890123456789012345678901234567890123456789
        #             0         1         2         3         4

        # NOTE(review): the files are written with the platform default
        # encoding; confirm it matches what SearchEngine uses to read them.
        with open('test_window_one.txt', 'w') as self.test_file:
            self.test_file.write(self.strr)

        with open('test_window_two.txt', 'w') as self.test_file:
            self.test_file.write(self.strr2)

        with open('test_window_three.txt', 'w') as self.test_file:
            self.test_file.write(self.strr3)
            self.test_file.write(self.strr4)

        self.x = SearchEngine("test_db")
        self.x.database.update(idealdict)
def cli(method):
    """
    A search engine that searches tweets related to USA elections 2020 given a query.

    Instantiates ``SearchEngine`` with *method* and returns ``0``. The engine
    instance is only constructed here; this function never queries it.
    """
    engine = SearchEngine(method)  # built for its construction side effects only
    status = 0
    return status
def main():
    """Run an interactive console loop over the search engine.

    Prompts for a search string until the user types ``exit``. For every
    query it calls ``find_supplemented_window`` and prints each key of the
    result, followed by every value stored under that key and that value's
    ``get_BB_string()`` rendering.
    """
    window_depth = 2  # second argument of find_supplemented_window
    engine = SearchEngine('database')
    while True:
        query = input("Слово для поиска: ")
        if query == "exit":
            return

        found = engine.find_supplemented_window(query, window_depth)
        for key in found:
            print(key)
            for item in found[key]:
                print(item)
                print(item.get_BB_string())
Exemplo n.º 5
0
  def __init__(self, data_dir, feature_type, survey):
    """Wire up the search engine, state machine and action manager.

    Args:
      data_dir: Directory holding the data files. It is normalised first;
        each file is then looked up as ``<data_dir>/<basename><extension>``.
      feature_type: Passed to ``StateMachine`` as ``feat``.
      survey: Passed to ``ActionManager`` as ``survey``.
    """
    data_dir = os.path.normpath(data_dir)
    data_name = os.path.basename(data_dir)

    def data_path(extension):
      # Every data file is named after the directory itself.
      return os.path.join(data_dir, data_name + extension)

    # Search engine: built from the four index files found in data_dir.
    engine = SearchEngine(
        lex_file=data_path('.lex'),
        background_file=data_path('.background'),
        inv_index_file=data_path('.index'),
        doclengs_file=data_path('.doclength'),
    )
    self.searchengine = engine

    # State machine: reuses the structures exposed by the search engine.
    self.statemachine = StateMachine(
        background=engine.background,
        inv_index=engine.inv_index,
        doclengs=engine.doclengs,
        data_dir=data_dir,
        feat=feature_type,
    )

    # Action manager.
    self.actionmanager = ActionManager(
        background=engine.background,
        doclengs=engine.doclengs,
        data_dir=data_dir,
        survey=survey,
    )

    # Training or testing: starts out in test mode.
    self.test_flag = True
Exemplo n.º 6
0
import os
import sys

from flask import request
from flask_api import FlaskAPI

# Make the indexer sources importable. `os` must be imported for this call:
# the original snippet used os.path without importing it. The relative path
# is resolved against the current working directory.
sys.path.append(os.path.abspath("../indexer/src"))
from searchengine import SearchEngine
from tokenstore import TokenStore

app = FlaskAPI(__name__)


@app.route("/")
def index():
    """Serve the static landing page ``html/index.html``."""
    landing_page = 'html/index.html'
    return app.send_static_file(landing_page)


@app.route("/api/search", methods=["POST"])
def search() -> list:
    """
    Run the queries from the POST body through the search engine.

    The ``queries`` field of the request data is lower-cased and split on
    ``,``; the resulting list is handed to ``search_engine.search`` and its
    result is returned unchanged.

    Example:
    curl -X POST http://127.0.0.1:5000/api/search -d  queries="hello world"
    """
    run_search = search_engine.search
    raw_queries = request.data.get("queries")
    terms = raw_queries.lower().split(",")
    return run_search(terms)


if __name__ == "__main__":
    # The /api/search handler reads `search_engine` as a module global, so it
    # has to be bound before the server starts accepting requests.
    store = TokenStore()
    search_engine = SearchEngine(store)

    # NOTE(review): debug=True together with host="0.0.0.0" exposes the
    # debug server on every network interface -- confirm this entry point is
    # only used for local development.
    app.run(host="0.0.0.0", debug=True)
Exemplo n.º 7
0
    regex = args.regex
    # Defaults, used unless a command-line argument below overrides them.
    method_search_state = MethodSearch.all
    method_sorting = SortMethod.abc
    sortOrder = SortOrder.asc
    # NOTE(review): stat_method_state is not read anywhere in the visible code.
    stat_method_state = MethodStat.count

    # Pick the search mode: both flags together select the combined mode,
    # otherwise whichever single flag is set.
    if args.count_mathces_arg and args.unique_mathces_arg:
        method_search_state = MethodSearch.unique_count
    else:
        if args.unique_mathces_arg:
            method_search_state = MethodSearch.unique
        if args.count_mathces_arg:
            method_search_state = MethodSearch.count
    # Checked last, so the line-count flag overrides any mode chosen above.
    if args.count_line_mathces_arg:
        method_search_state = MethodSearch.line

    # Translate the textual sort options through their lookup tables.
    if args.sort_method_arg:
        method_sorting = sort_method_map[args.sort_method_arg]

    if args.sort_order_arg:
        sortOrder = sort_order_map[args.sort_order_arg]

    sorter = Sorter(method_sorting, sortOrder)

    # Statistics mode and search mode are mutually exclusive; both print the
    # result of begin() for the same input strings and row limit.
    if args.stat_arg:
        stat_engine = StatEngine(regex, stat_method_map[args.stat_arg], sorter)
        print(stat_engine.begin(data_strings, args.num_print_rows_arg))
    else:
        search_engine = SearchEngine(regex, method_search_state, sorter)
        print(search_engine.begin(data_strings, args.num_print_rows_arg))
Exemplo n.º 8
0
# Map each parsed command-line option onto the matching file-path variable.
for x, y in opts:
    if x == '-d':
        dictionary_file = y
    elif x == '-p':
        postings_file = y
    elif x == '-q':
        query_file = y
    elif x == '-o':
        results_file = y
    else:
        raise AssertionError('unhandled option')

# All four paths are mandatory: print the usage line and exit with status 2
# when any of them was not supplied. Identity comparison is the idiomatic
# (and override-proof) way to test for None.
if any(path is None for path in (dictionary_file, postings_file, query_file, results_file)):
    print(f'usage: {sys.argv[0]} -d dictionary-file -p postings-file -q file-of-queries -o output-file-of-results')
    sys.exit(2)

document_file = 'document.txt'
dictionary = load_dictionary(dictionary_file)
documents = load_documents(document_file)
search_engine = SearchEngine(dictionary, documents, postings_file)
query, relevant_doc_ids = read_query(query_file)

# Mode 'w' truncates the file and starts at offset 0, so no seek is needed.
with open(results_file, 'w') as f:
    try:
        result = search_engine.search(query, relevant_doc_ids)
        # Results are written as one space-separated line.
        f.write(' '.join([str(i) for i in result]) + '\n')
    except ParseError as e:
        # A malformed query is reported in the results file instead of crashing.
        f.write(f'parse error encountered: {e}')

Exemplo n.º 9
0
 def do_POST(self):
     """
     POST handler for query.

     Parses the posted form with ``self.parse_url``, optionally recomputes
     the per-document offsets when a paging action was submitted, runs
     ``SearchEngine.search_limit_offset`` and writes an HTML page with the
     results grouped by document.

     Error handling:
       * ``TypeError`` -- answered with a message about invalid
         "limit"/"offset" values.
       * any other exception -- answered with the error text and a blank
         form listing the ``*.txt`` files of the current directory.

     Every branch sends a 200 status line and the HTML content-type header
     before writing the body.
     """
     try:
         content_length = int(self.headers['Content-Length'])
         # NOTE(review): str() of a bytes object yields its repr ("b'...'"),
         # not decoded text; presumably parse_url copes with that -- confirm.
         body = str(self.rfile.read(content_length))
         print("body = " + body)
         (query, limit, offset, limits, offsets,
          action, action_doc, action_exists) = self.parse_url(body)
         print("query = " + query)
         print("doclimit = " + limit)
         print("docoffset = " + offset)
         print("action = " + action)
         print("actiondoc = " + action_doc)
         if action_exists:
             offsets = self.get_new_offset_limit(action, action_doc,
                                                 offsets, limits)
         print('limits = ' + str(limits))
         print('offsets = ' + str(offsets))
         search_engine = SearchEngine('database')
         r = search_engine.search_limit_offset(query, 4, limit, offset,
                                               limits, offsets)
         # NOTE(review): keys and values are inserted into the HTML without
         # escaping -- confirm they can never carry untrusted markup.
         myresp = ''
         myresp += 'Documents Limit<br><input type="text" name="limit" value="' + str(
             limit) + '"><br>'
         myresp += 'Documents Offset<br><input type="text" name="offset" value="' + str(
             offset) + '"><br>'
         # One block per document, in sorted key order; j is the document's
         # position and indexes the per-document limits/offsets.
         for j, key in enumerate(sorted(r.keys())):
             myresp += '<ol>\n'
             myresp += '<li>' + key + '</li>\n<ul>'
             myresp += 'Limit<br><input type="text" name="doc' + str(
                 j) + 'limit" value="' + limits[j] + '"><br>'
             # The opening quote of the name attribute was missing here, which
             # left a stray '"' inside the submitted field name.
             myresp += 'Offset<br><input type="text" name="doc' + str(
                 j) + 'offset" value="' + offsets[j] + '"><br>'
             myresp += '<input type="submit" name=action' + str(
                 j) + ' value="perv">'
             myresp += '<input type="submit" name=action' + str(
                 j) + ' value="back">'
             myresp += '<input type="submit" name=action' + str(
                 j) + ' value="next"> <br>'
             for val in r[key]:
                 myresp += '<li>' + val + '</li>'
             myresp += '</ul>'
             myresp += '</ol>'
         self.send_response(200)
         self.send_header("Content-type", "text/html; charset=utf-8")
         self.end_headers()
         self.wfile.write(
             bytes((resp + data.format(query, myresp)), "utf-8"))
     except TypeError:
         response = 'fields "limit" and "offset" can not take a negative or fractional values'
         # A status line and headers must precede the body; without them the
         # client receives a malformed HTTP response.
         self.send_response(200)
         self.send_header("Content-type", "text/html; charset=utf-8")
         self.end_headers()
         self.wfile.write(bytes((resp + data.format('', response)),
                                "utf-8"))
     except Exception as ex:
         response = '<br>Uuups. Something went wrong. Error message: ' + str(
             ex) + '<br>'
         self.send_response(200)
         self.send_header("Content-type", "text/html; charset=utf-8")
         self.end_headers()
         files = os.listdir(".\\")
         response += 'Documents Limit<br><input type="text" name="limit" value="0"><br>'
         response += 'Documents Offset<br><input type="text" name="offset" value="0"><br>'
         # Raw string keeps "\." a regex escape instead of an invalid string
         # escape; only matching files are numbered.
         txt_files = [name for name in files if re.match(r".*\.txt", name)]
         for i, name in enumerate(txt_files):
             response += (name + "<br>")
             response += 'Limit<br><input type="text" name=doc' + str(
                 i) + 'limit value="0"><br>'
             response += 'Offset<br><input type="text" name=doc' + str(
                 i) + 'offset value="0"><br>'
             response += '<input type="submit" name=action' + str(
                 i) + ' value="perv">'
             response += '<input type="submit" name=action' + str(
                 i) + ' value="back">'
             response += '<input type="submit" name=action' + str(
                 i) + ' value="next"> <br>'
         self.wfile.write(
             bytes((resp + data.format('', 'Not Found<br>' + response)),
                   "utf-8"))
 def setUp(self):
     """Create the engine under test and load the reference data into it.

     ``self.x`` is a ``SearchEngine`` opened on ``"database"`` whose
     ``database`` attribute is updated with the module-level ``idealdict``.
     """
     engine = SearchEngine("database")
     self.x = engine
     engine.database.update(idealdict)
Exemplo n.º 11
0
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/corpus1.pkl'
    ) + p.get_corpus(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/corpus2.pkl'
    ) + p.get_corpus(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/corpus3.pkl'
    ) + p.get_corpus(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/corpus4.pkl'
    ) + p.get_corpus(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/corpus5.pkl'
    )
    cat_data = pd.DataFrame(
        columns=['id', 'tokens', 'category', 'category_code', 'pred_category'])
    cat_data = t.get_categorised_data(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/data_categorisation1_final.pkl',
        cat_data)
    cat_data = t.get_categorised_data(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/data_categorisation2_final.pkl',
        cat_data)
    cat_data = t.get_categorised_data(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/data_categorisation3_final.pkl',
        cat_data)
    cat_data = t.get_categorised_data(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/data_categorisation4_final.pkl',
        cat_data)
    cat_data = t.get_categorised_data(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/data_categorisation5_final.pkl',
        cat_data)
    engine = SearchEngine(model, corpus, cat_data)
    print('Done!')
    application.run()
Exemplo n.º 12
0
 def setUp(self):
     """Create a fresh ``SearchEngine`` with default arguments as ``self.x``."""
     engine = SearchEngine()
     self.x = engine
Exemplo n.º 13
0
            disabled_f_doc = ''
            if limit_doc >= len(res[k]):
                disabled_f_doc = 'disabled'

            disabled_b_doc = ''
            if offset_doc == 0:
                disabled_b_doc = 'disabled'

            for j, v in enumerate(res[k]):
                if j == limit_doc:
                    break
                re += '<li>' + v.get_BB_string() + '</li>'

            re += '</ul>'

            result += form_limit.format(k, str(i), str(offset_doc),
                                        str(limit_doc), re, disabled_f_doc,
                                        disabled_b_doc)

        ret_result = body.format(postvars['findstr'][0], str(offset),
                                 str(limit), result, disabled_f, disabled_b)

        return ret_result


if __name__ == '__main__':
    # Serve on localhost:80 using the request handler named custom_handler.
    server_address = ('localhost', 80)
    httpd = HTTPServer(server_address, custom_handler)
    # The engine is attached to the server object as `search_engine`.
    httpd.search_engine = SearchEngine('database')
    print('Start server.')
    httpd.serve_forever()
Exemplo n.º 14
0
from flask import Flask, request
from flask import jsonify
from flask_cors import CORS
from io import BytesIO
import json
from searchengine import SearchEngine
from retriever import Retriever
from transformers import BertTokenizer, BertModel
from ranker import BertRanker, Word2vecRanker

app = Flask(__name__)
# Allow cross-origin requests from any origin, but only for the /get route.
cors = CORS(app, resources={r'/get': {"origins": "*"}})
se = SearchEngine()

# JSON is UTF-8 by specification; decode it explicitly instead of relying on
# the platform's locale-dependent default encoding.
with open('../cache/word_to_pos_count2.json', encoding='utf-8') as f:
    word_to_pos = json.load(f)
with open('../cache/pos_hanzi.json', encoding='utf-8') as f:
    pos_to_hanzi = json.load(f)
# Reverse lookup of pos_to_hanzi, plus an extra '不限' entry mapped to 'all'.
hanzi_to_pos = {hanzi:pos for pos, hanzi in pos_to_hanzi.items()}
hanzi_to_pos['不限'] = 'all'
print('finish load...')
initialPos = ['名词', '人名', '地名', '机构名', '其它专名', '数词', '量词', '数量词', '时间词', '方位词', '处所词', '动词', '形容词', '副词', '前接成分', '后接成分', '习语', '简称', '代词', '连词', '介词', '助词', '语气助词', '叹词', '拟声词', '语素', '标点', '其它']
# Models and rankers are loaded once at import time.
bert = BertModel.from_pretrained("hfl/chinese-bert-wwm-ext")
tokenizer = BertTokenizer.from_pretrained("hfl/chinese-bert-wwm-ext")
bertranker = BertRanker()
word2vecranker = Word2vecRanker("/data/disk2/private/hujinyi/IRHomework/cache/sgns.renmin.word")

@app.route('/')
def hello_world():
    """Return a fixed greeting for the root URL."""
    greeting = 'Hello World!'
    return greeting