Пример #1
0
 def show_main_menu():
     """Run the interactive main-menu loop until the user chooses to exit.

     Each pass prints the menu, reads one option from standard input and
     dispatches to the matching ``SearchEngine`` query, handing its result
     to the corresponding ``PTPrint`` printer.  Option ``5`` opens the
     advanced-search sub-menu.  Any other input is reported as invalid and
     the menu is shown again.

     Returns:
         False, once the user selects option ``0``.
     """
     while True:
         print(
             '\n'
             'What would you like to do:\n'
             '   (1) List statistics\n'
             '   (2) Display 3 cities with longest names\n'
             '   (3) Display county\'s name with the largest number of communities\n'
             '   (4) Display locations, that belong to more than one category\n'
             '   (5) Advanced search\n'
             '   (0) Exit program\n')
         option = input('Choose option: ')
         if option == '1':
             PTPrint.list_statistics_print(SearchEngine.list_statistics())
         elif option == '2':
             PTPrint.cities_with_longest_names_print(
                 SearchEngine.cities_with_longest_names(3))
         elif option == '3':
             PTPrint.largest_number_of_communities_print(
                 SearchEngine.largest_number_of_communities(4))
         elif option == '4':
             PTPrint.more_then_one_cat_loc_print(
                 SearchEngine.more_than_one_category_locations())
         elif option == '5':
             Menu.advanced_search()
         elif option == '0':
             return False
         else:
             # Previously a bare string expression: the message was built
             # and thrown away, so invalid input was silently ignored.
             print('There is no such option in menu')
Пример #2
0
    def setUp(self):
        """Write the three text fixtures to disk and prepare the engine.

        The fixture strings are kept on the instance so tests can build the
        expected windows from them.  The engine is created on ``test_db``
        and its in-memory database is filled from ``idealdict``.
        """
        self.strr = 'sun window tree apple, juse border films 23 45good'
        #            01234567890123456789012345678901234567890123456789
        #            0         1         2         3         4
        self.strr2 = 'Мы тестируем нашу программу для работы с окнами. '
        #             01234567890123456789012345678901234567890123456789
        #             0         1         2         3         4
        self.strr3 = 'Первая строка для тестов.\n'
        #              01234567890123456789012345678901234567890123456789
        #              0         1         2         3         4

        self.strr4 = 'Вторая строка для тестов.'
        #              01234567890123456789012345678901234567890123456789
        #              0         1         2         3         4

        # ``with`` guarantees each handle is closed even when a write fails.
        # The handle is still bound to ``self.test_file`` because the
        # original code left that attribute on the instance.
        with open('test_window_one.txt', 'w') as self.test_file:
            self.test_file.write(self.strr)

        with open('test_window_two.txt', 'w') as self.test_file:
            self.test_file.write(self.strr2)

        # The third fixture is a two-line file: strr3 ends with a newline.
        with open('test_window_three.txt', 'w') as self.test_file:
            self.test_file.write(self.strr3)
            self.test_file.write(self.strr4)

        self.x = SearchEngine("test_db")
        self.x.database.update(idealdict)
Пример #3
0
 def setUp(self):
     """Create the engine on ``db_name`` and write both sample documents."""
     self.engine = SearchEngine('db_name')
     index = self.engine.database
     # Preload the index with the module-level reference data.
     index.update(database)
     with open("test1.txt", 'w') as first_doc:
         first_doc.write(test1)
     with open("test2.txt", 'w') as second_doc:
         second_doc.write(test2)
class TestMySearchEngine(unittest.TestCase):
    """Checks of single-token lookup through ``SearchEngine.search``."""

    def setUp(self):
        # Fresh engine per test, preloaded with the reference index.
        self.x = SearchEngine("database")
        self.x.database.update(idealdict)

    def tearDown(self):
        # Drop the engine first, then delete the files whose names start
        # with 'database.' from the working directory.
        del self.x
        leftovers = [name for name in os.listdir('.')
                     if name.startswith('database.')]
        for name in leftovers:
            os.remove(name)

    def test_searchengine_type(self):
        found = self.x.search("round")
        self.assertIsInstance(found, dict)

    def test_MyError_type_number(self):
        # A non-string query has to be rejected with ValueError.
        self.assertRaises(ValueError, self.x.search, 15)

    def test_empty_string(self):
        found = self.x.search('')
        self.assertIsInstance(found, dict)
        self.assertEqual(found, {})

    def test_search_by_token(self):
        found = self.x.search('The')
        self.assertIsInstance(found, dict)
        self.assertEqual(found, idealdict['The'])
Пример #5
0
class MASLsTest(unittest.TestCase):
    """Checks of ``SearchEngine.merge_and_sort_lists`` on lists of lists."""

    def setUp(self):
        self.x = SearchEngine()

    def tearDown(self):
        # Nothing to clean up.
        pass

    def _assert_merged(self, lists, wanted):
        # Materialise the result so it can be compared with a plain list.
        merged = self.x.merge_and_sort_lists(lists)
        self.assertEqual(list(merged), wanted)

    def test_list_empty(self):
        self._assert_merged([], [])

    def test_list_empty2(self):
        self._assert_merged([[], [], []], [])

    def test_1_list_int(self):
        self._assert_merged([[9, 10, 11]], [9, 10, 11])

    def test_2_list_int(self):
        self._assert_merged([[1, 2, 3], [9, 10, 11]],
                            [1, 2, 3, 9, 10, 11])

    def test_3_list_int(self):
        self._assert_merged([[1, 2, 3], [9, 10, 11], [4, 5, 6]],
                            [1, 2, 3, 4, 5, 6, 9, 10, 11])

    def test_3_list_int_dif_len(self):
        self._assert_merged(
            [[1, 2, 3], [9, 10, 11, 12, 13, 14, 15], [4, 5, 6, 7, 8]],
            [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])

    def test_3_list_int_empty(self):
        self._assert_merged([[1, 2, 3], [], [4, 5, 6, 7, 8]],
                            [1, 2, 3, 4, 5, 6, 7, 8])

    def test_1_list_abc(self):
        # A single input list is expected back in its original order.
        self._assert_merged([['c', 'a', 'b']], ['c', 'a', 'b'])

    def test_3_list_abc_dif_len(self):
        self._assert_merged([['a', 'b', 'c'], ['d', 'f', 'g', 'h']],
                            ['a', 'b', 'c', 'd', 'f', 'g', 'h'])
def main():
    """Prompt for search strings and print the windows found for each one.

    The loop ends when the user types ``exit``.  For every key of the
    result the key is printed, followed by each of its values and that
    value's ``get_BB_string()`` rendering.
    """
    engine = SearchEngine('database')
    window_depth = 2

    # iter(callable, sentinel) keeps prompting until "exit" is entered.
    for query in iter(lambda: input("Слово для поиска: "), "exit"):
        found = engine.find_supplemented_window(query, window_depth)
        for key in found:
            print(key)
            for window in found[key]:
                print(window)
                print(window.get_BB_string())
def cli(method):
    """
    A search engine that searches tweets related to USA elections 2020 given a query.

    Builds a ``SearchEngine`` for ``method`` and returns 0.
    """

    # The instance is not used afterwards, so it is no longer bound to a
    # local name; the constructor call itself is kept.
    SearchEngine(method)

    return 0
Пример #8
0
 def advanced_search():
     """Run the advanced-search sub-menu until the user leaves it.

     Each pass prints the sub-menu and reads an option.  For options
     ``1``-``3`` a search term is read and the matching ``SearchEngine``
     query result is printed through ``PTPrint.advanced_search_print``.
     Unknown options are reported and the menu is shown again.

     Returns:
         False, once the user selects option ``0``.
     """
     while True:
         print('What would you like to do:\n'
               '   (1) Search by name\n'
               '   (2) Search by type\n'
               '   (3) Search by district\n'
               '   (0) Exit search\n')
         option = input('Choose option: ')
         if option == '0':
             return False
         if option not in ('1', '2', '3'):
             # Validate before prompting: previously an unknown option
             # still asked for a search term and then did nothing with it.
             print('There is no such option in menu')
             continue
         search = input('Search for:')
         if option == '1':
             found = SearchEngine.advanced_search_by_name(search)
         elif option == '2':
             found = SearchEngine.advanced_search_by_type(search)
         else:
             found = SearchEngine.advanced_search_by_district(search)
         PTPrint.advanced_search_print(found)
class TestForMultiWordSearch(unittest.TestCase):
    """Checks of multi-token lookup via ``SearchEngine.multiple_search``."""

    def setUp(self):
        # Fresh engine per test, preloaded with the reference index.
        self.x = SearchEngine("database")
        self.x.database.update(idealdict)

    def tearDown(self):
        # Drop the engine first, then delete the files whose names start
        # with 'database.' from the working directory.
        del self.x
        leftovers = [name for name in os.listdir('.')
                     if name.startswith('database.')]
        for name in leftovers:
            os.remove(name)

    def test_searchengine_type(self):
        found = self.x.multiple_search("round")
        self.assertIsInstance(found, dict)

    def test_MyError_type_number(self):
        # A non-string query has to be rejected with ValueError.
        self.assertRaises(ValueError, self.x.multiple_search, 15)

    def test_empty_string(self):
        found = self.x.multiple_search('')
        self.assertIsInstance(found, dict)
        self.assertEqual(found, {})

    def test_search_by_token(self):
        found = self.x.multiple_search('The')
        self.assertIsInstance(found, dict)
        self.assertEqual(found, idealdict['The'])

    def test_search_two_tokens(self):
        found = self.x.multiple_search('The sun')
        self.assertIsInstance(found, dict)
        expected = {
            'file2.txt': [Position(1, 4, 0), Position(5, 8, 0)],
        }
        self.assertEqual(found, expected)
Пример #10
0
  def __init__(self, data_dir, feature_type, survey):
    """Build the search engine, state machine and action manager.

    Args:
      data_dir: directory with the data files; after normalisation its
        base name is also the stem of every file name built here.
      feature_type: forwarded to ``StateMachine`` as ``feat``.
      survey: forwarded to ``ActionManager`` as ``survey``.
    """
    data_dir = os.path.normpath(data_dir)
    stem = os.path.basename(data_dir)

    def data_file(extension):
      # <data_dir>/<stem><extension>
      return os.path.join(data_dir, stem + extension)

    lex_path = data_file('.lex')
    background_path = data_file('.background')
    index_path = data_file('.index')
    doclength_path = data_file('.doclength')

    # Search engine over the four data files.
    self.searchengine = SearchEngine(
        lex_file=lex_path,
        background_file=background_path,
        inv_index_file=index_path,
        doclengs_file=doclength_path,
    )

    # State machine, fed from the structures the search engine exposes.
    self.statemachine = StateMachine(
        background=self.searchengine.background,
        inv_index=self.searchengine.inv_index,
        doclengs=self.searchengine.doclengs,
        data_dir=data_dir,
        feat=feature_type,
    )

    # Action manager.
    self.actionmanager = ActionManager(
        background=self.searchengine.background,
        doclengs=self.searchengine.doclengs,
        data_dir=data_dir,
        survey=survey,
    )

    # Training or testing; starts as True.
    self.test_flag = True
Пример #11
0
    def __init__(self, database, window_len, files=None, path=None):
        """Index the requested files, then initialise the search engine.

        Args:
            database: index database name; passed to ``Indexer`` and to
                ``SearchEngine.__init__``.
            window_len: stored on the instance as ``window_len``.
            files: optional iterable of file paths to index.
            path: optional directory; every entry that ``os.listdir``
                returns for it is indexed as well.
        """
        self.files = files  # list of files to analyse
        self.path = path  # directory with the text files
        self.window_len = window_len
        self.db_name = database

        indexator = Indexer(self.db_name)

        file_list = []

        if self.files is not None:
            file_list.extend(self.files)

        if self.path is not None:
            # os.path.join is correct whether or not ``path`` ends with a
            # separator; plain concatenation glued the directory and file
            # names together when it did not.
            file_list.extend(
                os.path.join(self.path, f)
                for f in os.listdir(path=self.path))

        for p in file_list:
            print("Indexing file: ", p)
            indexator.prescribe_index(p)
        # NOTE(review): presumably releases the indexer's hold on the
        # database before the engine opens it - confirm.
        del indexator

        SearchEngine.__init__(self, database)
Пример #12
0
def _main():
    """Parse the arguments, solve the puzzle and report the outcome.

    Reads the puzzle from the input file, runs the requested algorithm
    with the given timeout, prints the search summary and, when an output
    file was requested, writes the final grid to it.  Known failures are
    reported on standard output; a blank line is always printed last.
    """
    try:
        _setup_logging()
        args = _parse_command_line_arguments()
        puzzle_cells = PuzzleParser.read_from_file(args.input_file)
        summary = SearchEngine.find_solution(
            puzzle_cells, args.algorithm, args.timeout_sec)
        _print_search_summary(summary)
        if args.output_file is not None:
            GridFormatter.write_to_file(summary.final_grid, args.output_file)
    except InvalidInputError as error:
        message = "Failed to parse the input file {0}: {1}".format(
            args.input_file, error)
        print(message)
    except (InvalidPuzzleError, NoSuchAlgorithmError) as error:
        message = "Puzzle rejected by the search engine: {0}".format(error)
        print(message)
    except (FileNotFoundError, IsADirectoryError, PermissionError) as error:
        message = "I/O error: {0}".format(error)
        print(message)
    except (ValueError, RuntimeError) as error:
        message = "Unexpected error has occured: {0}".format(error)
        print(message)
    finally:
        # Trailing blank line, printed on success and on failure alike.
        print()
Пример #13
0
    # Defaults, used when the corresponding command-line option is absent.
    regex = args.regex
    method_search_state = MethodSearch.all
    method_sorting = SortMethod.abc
    sortOrder = SortOrder.asc
    stat_method_state = MethodStat.count

    # Pick the search mode: both flags together select unique_count,
    # otherwise whichever single flag is set applies.
    if args.count_mathces_arg and args.unique_mathces_arg:
        method_search_state = MethodSearch.unique_count
    else:
        if args.unique_mathces_arg:
            method_search_state = MethodSearch.unique
        if args.count_mathces_arg:
            method_search_state = MethodSearch.count
    # The line-count flag overrides any mode chosen above.
    if args.count_line_mathces_arg:
        method_search_state = MethodSearch.line

    # Translate the sort options through their lookup maps.
    if args.sort_method_arg:
        method_sorting = sort_method_map[args.sort_method_arg]

    if args.sort_order_arg:
        sortOrder = sort_order_map[args.sort_order_arg]

    sorter = Sorter(method_sorting, sortOrder)

    # Statistics mode when a stat option is given, plain search otherwise;
    # both print the result of begin() on the input strings.
    if args.stat_arg:
        stat_engine = StatEngine(regex, stat_method_map[args.stat_arg], sorter)
        print(stat_engine.begin(data_strings, args.num_print_rows_arg))
    else:
        search_engine = SearchEngine(regex, method_search_state, sorter)
        print(search_engine.begin(data_strings, args.num_print_rows_arg))
Пример #14
0
# Copy each parsed command-line option into its path variable.
for x, y in opts:
    if x == '-d':
        dictionary_file = y
    elif x == '-p':
        postings_file = y
    elif x == '-q':
        query_file = y
    elif x == '-o':
        results_file = y
    else:
        raise AssertionError('unhandled option')

# All four paths are mandatory; identity comparison is the idiomatic
# way to test for None.
required_paths = (dictionary_file, postings_file, query_file, results_file)
if any(path is None for path in required_paths):
    print(f'usage: {sys.argv[0]} -d dictionary-file -p postings-file -q file-of-queries -o output-file-of-results')
    sys.exit(2)

document_file = 'document.txt'
dictionary = load_dictionary(dictionary_file)
documents = load_documents(document_file)
search_engine = SearchEngine(dictionary, documents, postings_file)
query, relevant_doc_ids = read_query(query_file)

# Mode 'w' already truncates the file and starts at offset 0, so no
# explicit seek is needed before writing.
with open(results_file, 'w') as f:
    try:
        result = search_engine.search(query, relevant_doc_ids)
        f.write(' '.join(str(i) for i in result) + '\n')
    except ParseError as e:
        # A query that fails to parse is reported in the results file.
        f.write(f'parse error encountered: {e}')

Пример #15
0
class TestSearchEngine(unittest.TestCase):
    """Checks of the search, paging and generator helpers of the engine."""

    def setUp(self):
        # Engine over the reference index plus the two documents on disk.
        self.engine = SearchEngine('db_name')
        self.engine.database.update(database)
        with open("test1.txt", 'w') as first_doc:
            first_doc.write(test1)
        with open("test2.txt", 'w') as second_doc:
            second_doc.write(test2)

    def tearDown(self):
        del self.engine
        # Remove every file whose name starts with 'db_name' ...
        for filename in os.listdir(os.getcwd()):
            if filename.startswith('db_name'):
                os.remove(filename)
        # ... and the two sample documents, if they are still present.
        for document in ('test1.txt', 'test2.txt'):
            if document in os.listdir(os.getcwd()):
                os.remove(document)

    # -- helpers ----------------------------------------------------------

    @staticmethod
    def _hit_in_first():
        return Position_with_lines(11, 15, 0)

    @staticmethod
    def _hit_in_second():
        return Position_with_lines(3, 7, 0)

    def _hits_in_both(self):
        # Expected positions of 'test' in both sample documents.
        return {
            'test1.txt': [self._hit_in_first()],
            'test2.txt': [self._hit_in_second()],
        }

    def _limit_offset(self, doclimit, docoffset):
        return self.engine.search_limit_offset('test',
                                               doclimit=doclimit,
                                               docoffset=docoffset,
                                               limits=[2, 2],
                                               offsets=[0, 0])

    def _many_limit_offset(self, limit, offset):
        return self.engine.search_many_limit_offset('test',
                                                    limit=limit,
                                                    offset=offset,
                                                    limits=[2, 2],
                                                    offsets=[0, 0])

    def _many_limit_offset_gen(self, limit, offset):
        return self.engine.search_many_limit_offset_gen('test',
                                                        limit=limit,
                                                        offset=offset,
                                                        limits=[2, 2],
                                                        offsets=[0, 0])

    # -- plain search -----------------------------------------------------

    def test_empty(self):
        self.assertEqual(self.engine.search_one(''), {})

    def test_search_one(self):
        found = self.engine.search_one('test')
        self.assertEqual(found, self._hits_in_both())

    def test_search_many_one(self):
        found = self.engine.search_many('test')
        self.assertEqual(found, self._hits_in_both())

    def test_search_many_two(self):
        found = self.engine.search_many('my test')
        expected = {
            'test1.txt': [Position_with_lines(8, 10, 0),
                          Position_with_lines(11, 15, 0)],
            'test2.txt': [Position_with_lines(0, 2, 0),
                          Position_with_lines(3, 7, 0)],
        }
        self.assertEqual(found, expected)

    # -- search_limit_offset ----------------------------------------------

    def test_search_limit_offset_default(self):
        found = self.engine.search_limit_offset('test')
        self.assertEqual(found, {'test1.txt': [], 'test2.txt': []})

    def test_search_limit_offset_all(self):
        expected = {
            'test1.txt': ['this is my <strong>test</strong>'],
            'test2.txt': ['my <strong>test</strong>'],
        }
        self.assertEqual(self._limit_offset(2, 0), expected)

    def test_search_limit_offset_one(self):
        expected = {
            'test1.txt': ['this is my <strong>test</strong>'],
            'test2.txt': [],
        }
        self.assertEqual(self._limit_offset(1, 0), expected)

    def test_search_limit_offset_shift(self):
        expected = {
            'test1.txt': [],
            'test2.txt': ['my <strong>test</strong>'],
        }
        self.assertEqual(self._limit_offset(2, 1), expected)

    # -- search_many_limit_offset -----------------------------------------

    def test_search_many_limit_offset_one(self):
        self.assertEqual(self._many_limit_offset(1, 0),
                         {'test1.txt': [self._hit_in_first()]})

    def test_search_many_limit_offset_shift(self):
        self.assertEqual(self._many_limit_offset(1, 1),
                         {'test2.txt': [self._hit_in_second()]})

    def test_search_many_limit_offset_all(self):
        self.assertEqual(self._many_limit_offset(2, 0),
                         self._hits_in_both())

    # -- generator --------------------------------------------------------

    def test_generator(self):
        position_lists = [
            [Position_with_lines(12, 13, 1), Position_with_lines(3, 7, 0)],
            [Position_with_lines(11, 15, 0), Position_with_lines(3, 7, 0)],
            [],
        ]
        produced = list(self.engine.generator(position_lists))
        self.assertEqual(produced, [
            Position_with_lines(11, 15, 0),
            Position_with_lines(3, 7, 0),
            Position_with_lines(12, 13, 1),
            Position_with_lines(3, 7, 0),
        ])

    # -- search_many_limit_offset_gen -------------------------------------

    def test_search_many_limit_offset_gen_one(self):
        found = self._many_limit_offset_gen(1, 0)
        documents = list(found.keys())
        self.assertEqual(documents, ['test1.txt'])
        for name in documents:
            for hit in found[name]:
                self.assertEqual(hit, Position_with_lines(11, 15, 0))

    def test_search_many_limit_offset_gen_shift(self):
        found = self._many_limit_offset_gen(1, 1)
        documents = list(found.keys())
        self.assertEqual(documents, ['test2.txt'])
        for name in documents:
            for hit in found[name]:
                self.assertEqual(hit, Position_with_lines(3, 7, 0))

    def test_search_many_limit_offset_gen_all(self):
        found = self._many_limit_offset_gen(2, 0)
        documents = list(found.keys())
        self.assertEqual(documents, ['test1.txt', 'test2.txt'])
        for name in documents:
            for hit in found[name]:
                # Every yielded hit must equal the first indexed position
                # of 'test' in that document.
                self.assertEqual(hit, database['test'][name][0])
Пример #16
0
from searchengine import SearchEngine
from tokenstore import TokenStore


# Todo: reformat command line interface
def command_line(arg=None):
    """Prompt until the user enters an acceptable search query.

    The input is lower-cased.  Unless ``arg`` is already truthy, the text
    after the first ``" --"`` is taken as the flag and removed from the
    query.  Every character other than ASCII letters, digits and spaces is
    then stripped.  Queries longer than 32 characters are rejected and the
    prompt is repeated.

    Args:
        arg: flag supplied by the caller; when truthy the input is not
            scanned for a ``" --"`` suffix.

    Returns:
        A tuple ``(words, flag)``: the query split on whitespace, and the
        flag (``arg`` itself, the text typed after ``" --"``, or None).
    """
    while True:
        # Start every attempt from the caller's value so that a flag typed
        # with a rejected query does not leak into the next one.
        flag = arg
        raw_query = input("Please type in what do you want to search: ").lower()

        if " --" in raw_query and not flag:
            # Split on the first separator only: with several " --" in the
            # input an unbounded split yields more than two parts and the
            # unpacking raises ValueError.
            raw_query, flag = raw_query.split(" --", 1)

        query = re.sub("[^A-Za-z0-9 ]+", "", raw_query)

        if len(query) > 32:
            print("the query is too long, please shorten the search word")
        else:
            return query.split(), flag


if __name__ == '__main__':
    # Wire the engine to its token store.
    store = TokenStore()
    search_engine = SearchEngine(store)

    def _quit_on_interrupt(_signum, _frame):
        # Ctrl-C ends the program with exit status 0.
        exit(0)

    signal.signal(signal.SIGINT, _quit_on_interrupt)

    # Read queries until interrupted; the "all" flag adds a second
    # positional argument, False, to the search call.
    while True:
        search_queries, arg = command_line()
        if arg == "all":
            search_engine.search(search_queries, False)
        else:
            search_engine.search(search_queries)
Пример #17
0
class WindowsTest(unittest.TestCase):
    """Checks of ``SearchEngine.find_supplemented_window_lim_v3``.

    Every test runs against three small text files written in ``setUp``
    and an engine whose database is filled from ``idealdict``.
    """

    # Fixture files created in setUp and removed again in tearDown.
    _FIXTURE_FILES = ('test_window_one.txt',
                      'test_window_two.txt',
                      'test_window_three.txt')

    def setUp(self):
        self.strr = 'sun window tree apple, juse border films 23 45good'
        #            01234567890123456789012345678901234567890123456789
        #            0         1         2         3         4
        self.strr2 = 'Мы тестируем нашу программу для работы с окнами. '
        #             01234567890123456789012345678901234567890123456789
        #             0         1         2         3         4
        self.strr3 = 'Первая строка для тестов.\n'
        #              01234567890123456789012345678901234567890123456789
        #              0         1         2         3         4

        self.strr4 = 'Вторая строка для тестов.'
        #              01234567890123456789012345678901234567890123456789
        #              0         1         2         3         4

        # ``with`` closes each handle even when a write fails.
        with open('test_window_one.txt', 'w') as handle:
            handle.write(self.strr)

        with open('test_window_two.txt', 'w') as handle:
            handle.write(self.strr2)

        # The third fixture is a two-line file: strr3 ends with a newline.
        with open('test_window_three.txt', 'w') as handle:
            handle.write(self.strr3)
            handle.write(self.strr4)

        self.x = SearchEngine("test_db")
        self.x.database.update(idealdict)

    def tearDown(self):
        # Drop the engine, then remove the fixtures and the files whose
        # names start with 'test_db.'.
        del self.x
        for name in os.listdir(path="."):
            if name in self._FIXTURE_FILES or name.startswith('test_db.'):
                os.remove(name)

    def _search(self, query, window_size, limits):
        # All tests use document offset 0 and document limit 1.
        return self.x.find_supplemented_window_lim_v3(
            query, window_size, 0, 1, limits)

    def _assert_windows(self, result, filename, expected):
        """Compare the windows found for *filename* with *expected*.

        Each window is checked on its own first (text, then equality) so a
        failure points at the offending window; the whole result is then
        compared with the single-file dictionary.
        """
        for index, window in enumerate(expected):
            found = result[filename][index]
            self.assertEqual(found.allString, window.allString)
            self.assertEqual(found, window)
        self.assertEqual(result, {filename: expected})

    def test_wrong_input_error(self):
        # A list instead of a query string must be rejected.
        files = ['test_window_one.txt']
        with self.assertRaises(ValueError):
            self._search(files, 3, [(0, 1)])

    def test_absent_key(self):
        result = self._search('zzzz', 1, [(0, 1)])
        self.assertEqual(result, {})

    def test_empty_string(self):
        result = self._search('', 1, [(0, 1)])
        self.assertIsInstance(result, dict)
        self.assertEqual(result, {})

    def test_get_window_begin(self):
        result = self._search('sun', 1, [(0, 1)])
        win = TokenWindow(self.strr, [Position(0, 3, 0)], 0, 50)
        self._assert_windows(result, 'test_window_one.txt', [win])

    def test_get_window_simple(self):
        result = self._search('tree', 2, [(0, 1)])
        win = TokenWindow(self.strr, [Position(11, 15, 0)], 0, 50)
        self._assert_windows(result, 'test_window_one.txt', [win])

    def test_get_window_end(self):
        result = self._search('good', 1, [(0, 1)])
        win = TokenWindow(self.strr, [Position(46, 50, 0)], 0, 50)
        self._assert_windows(result, 'test_window_one.txt', [win])

    def test_get_window_simple2(self):
        result = self._search('нашу', 2, [(0, 1)])
        win = TokenWindow(self.strr2, [Position(13, 17, 0)], 0, 49)
        self._assert_windows(result, 'test_window_two.txt', [win])

    def test_get_window_simple_two_line(self):
        result = self._search('Вторая', 1, [(0, 1)])
        win = TokenWindow(self.strr4, [Position(0, 6, 1)], 0, 25)
        self._assert_windows(result, 'test_window_three.txt', [win])

    def test_get_window_two_result(self):
        result = self._search('тестов', 1, [(0, 2)])
        win1 = TokenWindow(self.strr3.replace('\n', ''),
                           [Position(18, 24, 0)], 0, 25)
        win2 = TokenWindow(self.strr4, [Position(18, 24, 1)], 0, 25)
        self._assert_windows(result, 'test_window_three.txt', [win1, win2])

    def test_get_window_two_result2(self):
        result = self._search('тестов', 1, [(0, 1)])
        win1 = TokenWindow(self.strr3.replace('\n', ''),
                           [Position(18, 24, 0)], 0, 25)
        self._assert_windows(result, 'test_window_three.txt', [win1])

    def test_get_window_two_result3(self):
        result = self._search('тестов', 1, [(1, 2)])
        win1 = TokenWindow(self.strr4.replace('\n', ''),
                           [Position(18, 24, 1)], 0, 25)
        self._assert_windows(result, 'test_window_three.txt', [win1])

    def test_get_window_two_word(self):
        result = self._search('Мы работы', 2, [(0, 2)])
        win = TokenWindow(
            self.strr2,
            [Position(0, 2, 0), Position(32, 38, 0)], 0, 49)
        self._assert_windows(result, 'test_window_two.txt', [win])

    def test_get_window_wrong_offset(self):
        # The (2, 1) pair is expected to give an empty window list.
        result = self._search('tree', 2, [(2, 1)])
        self.assertEqual(result['test_window_one.txt'], [])
        self.assertEqual(result, {'test_window_one.txt': []})
Пример #18
0
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/corpus1.pkl'
    ) + p.get_corpus(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/corpus2.pkl'
    ) + p.get_corpus(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/corpus3.pkl'
    ) + p.get_corpus(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/corpus4.pkl'
    ) + p.get_corpus(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/corpus5.pkl'
    )
    cat_data = pd.DataFrame(
        columns=['id', 'tokens', 'category', 'category_code', 'pred_category'])
    cat_data = t.get_categorised_data(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/data_categorisation1_final.pkl',
        cat_data)
    cat_data = t.get_categorised_data(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/data_categorisation2_final.pkl',
        cat_data)
    cat_data = t.get_categorised_data(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/data_categorisation3_final.pkl',
        cat_data)
    cat_data = t.get_categorised_data(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/data_categorisation4_final.pkl',
        cat_data)
    cat_data = t.get_categorised_data(
        'D:/9th semester/Information Retrieval Lab/package/scrapper/data/data_categorisation5_final.pkl',
        cat_data)
    engine = SearchEngine(model, corpus, cat_data)
    print('Done!')
    application.run()
Пример #19
0
class DialogueManager(object):
  """
    Ties together a SearchEngine, a StateMachine and an ActionManager for
    one interactive retrieval session.

    Typical use, as far as this class shows: construct once, call the
    instance with a query and its answer set to start a session, then
    alternate gen_state_feature / request / expand_query until game_over
    reports termination.
  """
  def __init__(self, data_dir, feature_type, survey):
    """
      Build the three collaborating components.

      Args:
        data_dir:     directory holding the index files; every file is named
                      after the directory's base name plus a fixed extension
        feature_type: forwarded to StateMachine as `feat`
        survey:       forwarded to ActionManager
    """
    data_dir  = os.path.normpath(data_dir)
    data_name = os.path.basename(data_dir)

    # Search Engine
    lex_file        = os.path.join(data_dir, data_name + '.lex')
    background_file = os.path.join(data_dir, data_name + '.background')
    inv_index_file  = os.path.join(data_dir, data_name + '.index')
    doclengs_file   = os.path.join(data_dir, data_name + '.doclength')

    self.searchengine = SearchEngine(
                                    lex_file        = lex_file,
                                    background_file = background_file,
                                    inv_index_file  = inv_index_file,
                                    doclengs_file   = doclengs_file
                                )

    # State Machine (shares the data already loaded by the search engine)
    self.statemachine = StateMachine(
                              background    = self.searchengine.background,
                              inv_index     = self.searchengine.inv_index,
                              doclengs      = self.searchengine.doclengs,
                              data_dir      = data_dir,
                              feat          = feature_type
                              )

    # Action Manager
    self.actionmanager = ActionManager(
                              background  = self.searchengine.background,
                              doclengs    = self.searchengine.doclengs,
                              data_dir    = data_dir,
                              survey      = survey
                              )

    # Training or Testing
    self.test_flag = True

  def __call__(self, query, ans, test_flag = False):
    """
      Set initial parameters for every session.

      Args:
        query:     the query of the new session
        ans:       relevance judgements for the query; evalAP only needs
                   `docID in ans` and `len(ans)`
        test_flag: stored on the instance as-is

      Returns nothing; it only resets the per-session attributes.
    """

    self.query = query
    self.ans   = ans

    # Interaction Parameters, action and current turn number
    self.cur_action  = -1 # Action None
    self.curtHorizon = 0

    # Previous retrieved results and MAP
    self.ret     = None

    self.lastMAP = 0.
    self.MAP     = 0.

    # Termination indicator
    self.terminal = False

    # Training or Testing
    self.test_flag = test_flag

  def gen_state_feature(self):
    """
      Run one retrieval pass and turn its result into a state feature.

      Side effects: updates self.ret, increments self.curtHorizon and
      shifts self.MAP into self.lastMAP before recomputing self.MAP.

      Return:
        whatever the state machine returns for the current turn
    """
    # Identity test: `!= None` would invoke the model's own __ne__.
    assert self.actionmanager.posmodel is not None

    # Search Engine Retrieves Result
    self.ret = self.searchengine.retrieve(self.actionmanager.posmodel,
                                          self.actionmanager.negmodel)

    self.curtHorizon += 1

    # Feature Extraction
    # NOTE(review): posmodel/negmodel are fed from posprior/negprior, not
    # from actionmanager.posmodel/negmodel -- confirm this is intentional.
    feature = self.statemachine(  ret         = self.ret,
                                  action_type = self.cur_action,
                                  curtHorizon = self.curtHorizon,

                                  posmodel    = self.actionmanager.posprior,
                                  negmodel    = self.actionmanager.negprior,
                                  posprior    = self.actionmanager.posprior,
                                  negprior    = self.actionmanager.negprior  )

    # Record mean average precision
    self.lastMAP = self.MAP
    self.MAP = self.evalAP(self.ret, self.ans)

    return feature

  def request(self, action_type):
    '''
      Sends request to simulator for more query info.

      Records action_type as the current action and returns a dict with
      the last retrieved result ('ret') and the action looked up in the
      action manager's actionTable ('action').
    '''
    self.cur_action = action_type
    request = {}
    request['ret']    = self.ret
    request['action'] = self.actionmanager.actionTable[ action_type ]
    return request

  def expand_query(self, response):
    '''
      Passes response to action manager for query expansion.

      The resulting (posmodel, negmodel) pair is stored on this instance
      and also returned.
    '''
    posmodel, negmodel = self.actionmanager.expand_query(response)

    self.posmodel = posmodel
    self.negmodel = negmodel

    return posmodel, negmodel

  def evalAP(self, ret, ans):
    """
      Average precision of a ranked list.

      Args:
        ret: ranked sequence of (docID, value) pairs
        ans: container of relevant docIDs (membership and len are used)

      Return:
        sum of precision@k over the ranks k holding a relevant document,
        divided by len(ans); 0. when ans is empty
    """
    # `in` replaces dict.has_key, which no longer exists in Python 3.
    hits = [ float(docID in ans) for docID, _ in ret ]
    cum_hits = np.cumsum(hits)
    # Multiplying by the hit flag zeroes the ranks with irrelevant documents.
    precision = [ cum_hits[idx] / (idx+1) * hit for idx, hit in enumerate(hits) ]
    return ( sum(precision)/len(ans) if len(ans) else 0. )

  def evalMAP(self, ret, ans):
    """
      Mean of evalAP over parallel collections.

      Args:
        ret: sequence of ranked lists
        ans: answers indexable by the same positions as ret

      Return:
        mean average precision; 0. when ret is empty
    """
    # evalAP is a method, so it must be reached through self.
    APs = [ self.evalAP(ranked, ans[i]) for i, ranked in enumerate(ret) ]
    print("warning!! MAP")
    return ( sum(APs)/len(APs) if APs else 0. )

  def calculate_reward(self):
    """
      Reward for the last step.

      Terminal step: costTable[4]. Otherwise: the cost of the current
      action plus costTable['lambda'] times the MAP improvement.
    """
    if self.terminal:
      # NOTE(review): 4 is presumably the index of the terminating action
      # in costTable -- confirm against ActionManager.
      reward = self.actionmanager.costTable[ 4 ]
    else:
      reward = self.actionmanager.costTable[ self.cur_action ] +\
               self.actionmanager.costTable['lambda'] * (self.MAP - self.lastMAP)

    return reward

  def show(self):
    """Mark the session as terminal and return the last result as {'ret': ...}."""
    self.terminal = True
    params = {'ret': self.ret }
    return params

  def game_over(self):
    """
      Tell whether the session has ended.

      The session ends once it is terminal or has reached 5 turns; in that
      case the query, answer and positive model are cleared.

      Return:
        (ended, MAP) tuple
    """
    if self.terminal or self.curtHorizon >= 5:
      self.query = None
      self.ans   = None
      self.actionmanager.posmodel = None
      return True, self.MAP
    return False, self.MAP
Пример #20
0
            disabled_f_doc = ''
            if limit_doc >= len(res[k]):
                disabled_f_doc = 'disabled'

            disabled_b_doc = ''
            if offset_doc == 0:
                disabled_b_doc = 'disabled'

            for j, v in enumerate(res[k]):
                if j == limit_doc:
                    break
                re += '<li>' + v.get_BB_string() + '</li>'

            re += '</ul>'

            result += form_limit.format(k, str(i), str(offset_doc),
                                        str(limit_doc), re, disabled_f_doc,
                                        disabled_b_doc)

        ret_result = body.format(postvars['findstr'][0], str(offset),
                                 str(limit), result, disabled_f, disabled_b)

        return ret_result


if __name__ == '__main__':
    # Bind an HTTP server to localhost:80 with custom_handler as the
    # request handler class.
    httpd = HTTPServer(('localhost', 80), custom_handler)
    # One SearchEngine (opened on 'database') is attached to the server
    # object -- presumably so handlers can reach it through the server;
    # confirm against custom_handler.
    httpd.search_engine = SearchEngine('database')
    print('Start server.')
    # Blocks and serves requests until the process is stopped.
    httpd.serve_forever()
Пример #21
0
 def setUp(self):
     """Create a SearchEngine with no arguments and keep it as ``self.x``."""
     self.x = SearchEngine()
Пример #22
0
 def find_solution(puzzle, algorithm, timeout_sec = 10):
     """
     Parse a puzzle string, search for a solution and format the final grid.

     Returns a ``(search_summary, formatted_final_grid)`` tuple, where the
     second item is the summary's ``final_grid`` rendered by GridFormatter.
     """
     summary = SearchEngine.find_solution(
         PuzzleParser.read_from_string(puzzle), algorithm, timeout_sec)
     return (summary, GridFormatter.write_to_string(summary.final_grid))
Пример #23
0
import sys

from flask import request
from flask_api import FlaskAPI

sys.path.append(os.path.abspath("../indexer/src"))
from searchengine import SearchEngine
from tokenstore import TokenStore

app = FlaskAPI(__name__)


@app.route("/")
def index():
    """Serve the static ``html/index.html`` file for the root URL."""
    return app.send_static_file('html/index.html')


@app.route("/api/search", methods=["POST"])
def search() -> list:
    """
    Run the posted queries through the module-level search engine.

    The ``queries`` field of the request data is lower-cased and split on
    commas before being handed to ``search_engine.search``.

    Example:
        curl -X POST http://127.0.0.1:5000/api/search -d  queries="hello world"
    """
    raw_queries = request.data.get("queries")
    terms = raw_queries.lower().split(",")
    return search_engine.search(terms)


if __name__ == "__main__":
    # search_engine becomes a module-level name here and is read by
    # search() on every request; it is only defined when this file is run
    # as a script.
    store = TokenStore()
    search_engine = SearchEngine(store)

    # Debug mode is on and the server listens on 0.0.0.0.
    app.run(debug=True, host="0.0.0.0")
Пример #24
0
 def do_POST(self):
     """
     POST handler for query.

     Reads the form body, lets ``parse_url`` split it into the query, the
     document-level limit/offset, the per-document limits/offsets and the
     pressed button, runs the search and writes the result page.

     Error handling:
       * TypeError -> a page stating that limit/offset values are invalid;
       * any other exception -> a fallback page listing the ``.txt`` files
         of the current directory with zeroed controls.
     Every path sends the status line and headers exactly once.
     """
     # Tracks whether the status line and headers are already written, so
     # the error handlers neither omit them nor send them a second time.
     headers_sent = False
     try:
         content_length = int(self.headers['Content-Length'])
         body = str(self.rfile.read(content_length))
         print("body = " + body)
         (query, limit, offset, limits, offsets, action, action_doc,
          action_exists) = self.parse_url(body)
         print("query = " + query)
         print("doclimit = " + limit)
         print("docoffset = " + offset)
         print("action = " + action)
         print("actiondoc = " + action_doc)
         if action_exists:
             offsets = self.get_new_offset_limit(action, action_doc,
                                                 offsets, limits)
         print('limits = ' + str(limits))
         print('offsets = ' + str(offsets))
         search_engine = SearchEngine('database')
         r = search_engine.search_limit_offset(query, 4, limit, offset,
                                               limits, offsets)
         # NOTE(review): query and the search results are inserted into the
         # page without HTML escaping -- confirm whether the results are
         # meant to carry markup before adding escaping here.
         myresp = ''
         myresp += 'Documents Limit<br><input type="text" name="limit" value="' + str(
             limit) + '"><br>'
         myresp += 'Documents Offset<br><input type="text" name="offset" value="' + str(
             offset) + '"><br>'
         # Documents are rendered in sorted key order; j is the document's
         # position in that order and indexes limits/offsets.
         for j, key in enumerate(sorted(r.keys())):
             doc_no = str(j)
             myresp += '<ol>\n'
             myresp += '<li>' + key + '</li>\n<ul>'
             myresp += 'Limit<br><input type="text" name="doc' + doc_no + \
                 'limit" value="' + limits[j] + '"><br>'
             # The opening quote of the name attribute was missing, which
             # produced the malformed attribute name=docNoffset".
             myresp += 'Offset<br><input type="text" name="doc' + doc_no + \
                 'offset" value="' + offsets[j] + '"><br>'
             myresp += '<input type="submit" name=action' + doc_no + ' value="perv">'
             myresp += '<input type="submit" name=action' + doc_no + ' value="back">'
             myresp += '<input type="submit" name=action' + doc_no + ' value="next"> <br>'
             for val in r[key]:
                 myresp += '<li>' + val + '</li>'
             myresp += '</ul>'
             myresp += '</ol>'
         self.send_response(200)
         self.send_header("Content-type", "text/html; charset=utf-8")
         self.end_headers()
         headers_sent = True
         self.wfile.write(
             bytes((resp + data.format(query, myresp)), "utf-8"))
     except TypeError:
         response = 'fields "limit" and "offset" can not take a negative or fractional values'
         # Without a status line and headers the body below would not form
         # a valid HTTP response.
         if not headers_sent:
             self.send_response(200)
             self.send_header("Content-type", "text/html; charset=utf-8")
             self.end_headers()
         self.wfile.write(bytes((resp + data.format('', response)),
                                "utf-8"))
     except Exception as ex:
         response = '<br>Uuups. Something went wrong. Error message: ' + str(
             ex) + '<br>'
         if not headers_sent:
             self.send_response(200)
             self.send_header("Content-type", "text/html; charset=utf-8")
             self.end_headers()
         # os.curdir is the portable spelling of the current directory;
         # the former ".\\" only resolves on Windows.
         files = os.listdir(os.curdir)
         i = 0
         response += 'Documents Limit<br><input type="text" name="limit" value="0"><br>'
         response += 'Documents Offset<br><input type="text" name="offset" value="0"><br>'
         for f in files:
             # Raw string: "\." in a plain string is an invalid escape.
             if re.match(r".*\.txt", f):
                 doc_no = str(i)
                 response += (f + "<br>")
                 response += 'Limit<br><input type="text" name=doc' + doc_no + \
                     'limit value="0"><br>'
                 response += 'Offset<br><input type="text" name=doc' + doc_no + \
                     'offset value="0"><br>'
                 response += '<input type="submit" name=action' + doc_no + ' value="perv">'
                 response += '<input type="submit" name=action' + doc_no + ' value="back">'
                 response += '<input type="submit" name=action' + doc_no + ' value="next"> <br>'
                 i = i + 1
         self.wfile.write(
             bytes((resp + data.format('', 'Not Found<br>' + response)),
                   "utf-8"))
Пример #25
0
class WindowsTest(unittest.TestCase):
    """
    Tests for ``SearchEngine.multiple_search_lim``.

    Every test runs against three small text files written by ``setUp`` and
    a scratch database named ``test_db`` pre-filled from ``idealdict``.
    """

    # Fixture files created in setUp and deleted in tearDown.
    FIXTURE_FILES = ('test_window_one.txt', 'test_window_two.txt',
                     'test_window_three.txt')

    def setUp(self):
        """Write the fixture files and open the engine on the scratch database."""
        # The rulers under each string give the character offsets that the
        # expected Position values below refer to.
        self.strr = 'sun window tree apple, juse border films 23 45good'
        #            01234567890123456789012345678901234567890123456789
        #            0         1         2         3         4
        self.strr2 = 'Мы тестируем нашу программу для работы с окнами. '
        #             01234567890123456789012345678901234567890123456789
        #             0         1         2         3         4
        self.strr3 = 'Первая строка для тестов.\n'
        #             01234567890123456789012345678901234567890123456789
        #             0         1         2         3         4

        self.strr4 = 'Вторая строка для тестов.'
        #             01234567890123456789012345678901234567890123456789
        #             0         1         2         3         4

        contents = (self.strr, self.strr2, self.strr3 + self.strr4)
        for name, text in zip(self.FIXTURE_FILES, contents):
            # The context manager closes the file even if the write fails.
            with open(name, 'w') as fixture:
                fixture.write(text)

        self.x = SearchEngine("test_db")
        self.x.database.update(idealdict)

    def tearDown(self):
        """Drop the engine, then delete the fixture files and ``test_db.*`` files."""
        # Release the engine before removing the files of its database.
        del self.x
        for name in os.listdir(path="."):
            if name in self.FIXTURE_FILES or name.startswith('test_db.'):
                os.remove(name)

    def test_wrong_input_error(self):
        """A non-string query raises ValueError."""
        with self.assertRaises(ValueError):
            self.x.multiple_search_lim(3, 0, 1)

    def test_absent_key(self):
        """A word that is not in the database gives an empty result."""
        result = self.x.multiple_search_lim('zzzz', 0, 1)
        self.assertEqual(result, {})

    def test_empty_string(self):
        """An empty query gives an empty dict."""
        result = self.x.multiple_search_lim('', 0, 1)
        self.assertIsInstance(result, dict)
        self.assertEqual(result, {})

    def test_get_window_begin(self):
        """Word at the very beginning of the file."""
        result = self.x.multiple_search_lim('sun', 0, 1)
        ideal = {'test_window_one.txt': [Position(0, 3, 0)]}
        self.assertEqual(result, ideal)

    def test_get_window_simple(self):
        """Word in the middle of the file."""
        result = self.x.multiple_search_lim('tree', 0, 1)
        ideal = {'test_window_one.txt': [Position(11, 15, 0)]}
        self.assertEqual(result, ideal)

    def test_get_window_end(self):
        """Word at the very end of the file."""
        result = self.x.multiple_search_lim('good', 0, 1)
        ideal = {'test_window_one.txt': [Position(46, 50, 0)]}
        self.assertEqual(result, ideal)

    def test_get_window_simple2(self):
        """Cyrillic word in the second fixture file."""
        result = self.x.multiple_search_lim('нашу', 0, 1)
        ideal = {'test_window_two.txt': [Position(13, 17, 0)]}
        self.assertEqual(result, ideal)

    def test_get_window_simple_two_line(self):
        """Word that only occurs on the second line of a file."""
        result = self.x.multiple_search_lim('Вторая', 0, 1)
        ideal = {'test_window_three.txt': [Position(0, 6, 1)]}
        self.assertEqual(result, ideal)

    def test_get_window_two_result(self):
        """Word that occurs once on each line of a two-line file."""
        result = self.x.multiple_search_lim('тестов', 0, 1)
        ideal = {'test_window_three.txt': [Position(18, 24, 0),
                                           Position(18, 24, 1)]}
        self.assertEqual(result, ideal)

    def test_get_window_two_result3(self):
        """Same query and expectation as test_get_window_two_result."""
        result = self.x.multiple_search_lim('тестов', 0, 1)
        ideal = {'test_window_three.txt': [Position(18, 24, 0),
                                           Position(18, 24, 1)]}
        self.assertEqual(result, ideal)

    def test_get_window_wrong_offset(self):
        """The call with arguments (5, 2) gives an empty result."""
        result = self.x.multiple_search_lim('tree', 5, 2)
        self.assertEqual(result, {})
 def setUp(self):
     """Open a SearchEngine on "database" and load ``idealdict`` into its database."""
     self.x = SearchEngine("database")
     self.x.database.update(idealdict)
Пример #27
0
from flask import Flask, request
from flask import jsonify
from flask_cors import CORS
from io import BytesIO
import json
from searchengine import SearchEngine
from retriever import Retriever
from transformers import BertTokenizer, BertModel
from ranker import BertRanker, Word2vecRanker

# Flask application; CORS is opened to every origin for the /get resource only.
app = Flask(__name__)
cors = CORS(app, resources={r'/get': {"origins": "*"}})
se = SearchEngine()

# Lookup tables loaded once at import time from the JSON cache files.
with open('../cache/word_to_pos_count2.json') as f:
    word_to_pos = json.load(f)
with open('../cache/pos_hanzi.json') as f:
    pos_to_hanzi = json.load(f)
# Inverse of pos_to_hanzi, plus an extra entry mapping '不限' to 'all'.
hanzi_to_pos = {hanzi:pos for pos, hanzi in pos_to_hanzi.items()}
hanzi_to_pos['不限'] = 'all'
print('finish load...')
# NOTE(review): presumably the part-of-speech labels offered by default -- confirm against the UI.
initialPos = ['名词', '人名', '地名', '机构名', '其它专名', '数词', '量词', '数量词', '时间词', '方位词', '处所词', '动词', '形容词', '副词', '前接成分', '后接成分', '习语', '简称', '代词', '连词', '介词', '助词', '语气助词', '叹词', '拟声词', '语素', '标点', '其它']
# Pretrained model and tokenizer are loaded from the same checkpoint name.
bert = BertModel.from_pretrained("hfl/chinese-bert-wwm-ext")
tokenizer = BertTokenizer.from_pretrained("hfl/chinese-bert-wwm-ext")
bertranker = BertRanker()
# NOTE(review): absolute, machine-specific path to the word-vector file.
word2vecranker = Word2vecRanker("/data/disk2/private/hujinyi/IRHomework/cache/sgns.renmin.word")

@app.route('/')
def hello_world():
    """Return a fixed greeting for the root URL."""
    return 'Hello World!'