def create_db_index(self, db_name, file_name):
    """
    The method is similar to 'create_complex_index_from_file', but it writes
    the computed index to a shelve database rather than to a dict with a
    complex structure.
    """
    if (not isinstance(db_name, str)) or (not isinstance(file_name, str)):
        raise ValueError("This method takes two strings as input: "
                         "the names (paths) of a database and a file.")
    with shelve.open(db_name, writeback=True) as db:
        tokenizer = t.Tokenizer()
        # the same logic as in the method create_complex_index_from_file
        with open(file_name, encoding="utf-8") as file:
            for i, string in enumerate(file):
                # for each token yielded by the generator
                for token in tokenizer.iter_tokenize(string):
                    if token.kind == "alpha" or token.kind == "digit":
                        # internal_dict stores the positions within one file
                        internal_dict = db.setdefault(token.word, {})
                        positions_list = internal_dict.setdefault(
                            file.name, [])
                        positions_list.append(
                            File_Position(token.start, token.end, i + 1))
def test_13(self):
    """
    Tests how the generator tokenizer works for a regular string
    with all types of tokens.
    """
    t = my_tokenizer_combined.Tokenizer()
    result = list(t.iter_tokenize("a string: 12,$,3"))
    self.assertEqual(len(result), 10)
    self.assertEqual(result[0].word, "a")
    self.assertEqual(result[0].kind, "alpha")
    self.assertEqual(result[1].word, " ")
    self.assertEqual(result[1].kind, "space")
    self.assertEqual(result[1].length, 1)
    self.assertEqual(result[3].word, ":")
    self.assertEqual(result[3].kind, "punct")
    self.assertEqual(result[3].length, 1)
    self.assertEqual(result[5].word, "12")
    self.assertEqual(result[5].kind, "digit")
    self.assertEqual(result[5].length, 2)
    self.assertEqual(result[7].word, "$")
    self.assertEqual(result[7].kind, "other")
    self.assertEqual(result[7].length, 1)
    self.assertIsInstance(result[0], my_tokenizer_combined.Advanced_Token)
    self.assertIsInstance(result[1], my_tokenizer_combined.Advanced_Token)
    self.assertIsInstance(result[3], my_tokenizer_combined.Advanced_Token)
    self.assertIsInstance(result[5], my_tokenizer_combined.Advanced_Token)
    self.assertIsInstance(result[7], my_tokenizer_combined.Advanced_Token)
def create_complex_index_from_file(self, path):
    """
    The method takes the name (a path) of a file as input and creates a dict
    with a token as a key and, as a value, another dict that maps the file
    name to a list of the token's 'file' positions.
    """
    if not isinstance(path, str):
        raise ValueError("This method takes a string containing "
                         "the name (a path) of a file as input.")
    our_dict = {}
    tokenizer = t.Tokenizer()
    with open(path, encoding="utf-8") as file:
        for i, string in enumerate(file):
            # for each token yielded by the generator
            for token in tokenizer.iter_tokenize(string):
                if token.kind == "alpha" or token.kind == "digit":
                    # internal_dict stores the positions within one file
                    internal_dict = our_dict.setdefault(token.word, {})
                    positions_list = internal_dict.setdefault(
                        file.name, [])
                    positions_list.append(
                        File_Position(token.start, token.end, i + 1))
    return our_dict
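# An illustrative sketch of the structure this method builds (the instance
# name `indexer`, the file name "sample.txt" and its contents are hypothetical):
#
#     index = indexer.create_complex_index_from_file("sample.txt")
#     # index might look like:
#     # {
#     #     "word": {"sample.txt": [File_Position(1, 4, 1),
#     #                             File_Position(7, 10, 2)]},
#     #     "12":   {"sample.txt": [File_Position(12, 13, 2)]},
#     # }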
def create_long_index_from_file(self, path):
    """
    The method takes the name (a path) of a file as input and creates a dict
    with a token as a key and a list of its 'long' positions as a value.
    """
    if not isinstance(path, str):
        raise ValueError("This method takes a string containing "
                         "the name (a path) of a file as input.")
    our_dict = {}
    with open(path, encoding="utf-8") as file:
        tokenizer = t.Tokenizer()
        for i, string in enumerate(file):
            # for each token yielded by the generator
            for obj in tokenizer.iter_tokenize(string):
                if obj.kind == "alpha" or obj.kind == "digit":
                    positions = our_dict.setdefault(obj.word, [])
                    positions.append(
                        Long_Position(obj.start, obj.end, i + 1, file.name))
    return our_dict
def test_12(self):
    """
    For a non-string (integer) object as an input.
    """
    t = my_tokenizer_combined.Tokenizer()
    with self.assertRaises(ValueError):
        t.advanced_tokenize(5)
def get_context(self, file, file_pos, length):
    """
    Returns a Context_Window instance for a word, given its position in a file.
    Takes the following arguments:
    file - the file name
    file_pos - a position of a word in a file
    length - the length of a context window
    """
    if not isinstance(file, str):
        raise ValueError(
            "The first argument should be a string with a file name.")
    if not isinstance(file_pos, my_indexer_combined.File_Position):
        raise ValueError(
            "The second argument should be a File_Position instance.")
    if not isinstance(length, int):
        raise ValueError("The third argument should be an integer number.")
    tokenizer = my_tokenizer_combined.Tokenizer()
    with open(file, encoding="utf-8") as f:
        # tokenize a sub-string to the right and an inverted sub-string to the left
        for i, string in enumerate(f):
            # we count lines to find the string that contains the word
            if file_pos.string_numb == i + 1:
                right_string = string
                # string[start:end:-1] returns an inverted string
                j = 0  # j is just a counter
                start = 1  # in case the 'length' of the context goes out of the string's range
                if file_pos.start != 1:
                    # otherwise start = 1 and we don't need to alter it
                    for token in tokenizer.iter_tokenize(
                            right_string[file_pos.start - 2::-1]):
                        if j < length:
                            if token.kind == "alpha" or token.kind == "digit":
                                j += 1
                            continue
                        # because we go through an inverted string
                        start = file_pos.start - token.end + 1
                        break
                n = 0
                # in case the 'length' of the context goes out of the string's range
                end = len(right_string) - 1  # the last symbol is '\n' - we don't need it
                for token in tokenizer.iter_tokenize(right_string[file_pos.end:]):
                    if n < length:
                        if token.kind == "alpha" or token.kind == "digit":
                            n += 1
                        continue
                    # because 'token.end' is computed relative to the substring
                    end = file_pos.end + token.end - 1
                    break
                window = Context_Window([file_pos], start, end, right_string[:-1])
                return window
def test_11(self):
    """
    For an empty string.
    """
    t = my_tokenizer_combined.Tokenizer()
    result = t.advanced_tokenize("")
    self.assertEqual(len(result), 0)
    self.assertEqual(result, [])
def the_second_simplest(self, query):
    """
    A method that returns a dict with positions of tokens from the query in
    files. It takes a string as an input, tokenizes it, and returns a dict
    with the names of files (those that contain all tokens of the input) as
    keys and lists of the tokens' positions in those files as values.
    """
    if not isinstance(query, str):
        raise ValueError("An argument should be a string.")
    dict_for_all = {}
    t = my_tokenizer_combined.Tokenizer()
    tokens = t.iter_tokenize(query.strip())  # strip leading and trailing spaces
    # intersection of keys - files containing all tokens of the input;
    # here we collect them and write them to the variable set_for_all
    for i, token in enumerate(tokens):
        if token.kind == "alpha" or token.kind == "digit":
            if i == 0:
                set_for_all = set(self.the_simplest(token.word))
            set_for_all &= set(self.the_simplest(token.word))
    # here we create the final dict
    tokens = t.iter_tokenize(query)
    for token in tokens:
        if token.kind == "alpha" or token.kind == "digit":
            for file in set_for_all:
                conjunction = dict_for_all.setdefault(file, [])
                # the_simplest method returns a dict with file names as keys
                conjunction += self.the_simplest(token.word)[file]
    # for each key we sort its list of values (i.e. the list of positions)
    for key in dict_for_all:
        dict_for_all[key].sort()
    ### NEW ###
    # here we go through all positions (i.e. the values for each key, where
    # the keys are file names) and delete duplicates
    for key in dict_for_all:
        new_positions = dict_for_all[key]  # they are sorted
        i = 0
        while i < len(new_positions) - 1:
            if new_positions[i] == new_positions[i + 1]:
                new_positions.pop(i)
            else:
                i += 1
        dict_for_all[key] = new_positions
    return dict_for_all
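# An illustrative sketch of the intersection search (the instance name `engine`,
# the file names and the positions are hypothetical; assume "file_1.txt"
# contains both query words and "file_2.txt" contains only "cat"):
#
#     result = engine.the_second_simplest("cat dog")
#     # only files containing ALL query tokens are kept, e.g.:
#     # {"file_1.txt": [File_Position(1, 3, 1),    # "cat" on line 1
#     #                 File_Position(5, 7, 1)]}   # "dog" on line 1
#     # "file_2.txt" is dropped because it lacks "dog"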
def test_6(self):
    """
    For only one punctuation mark as an input string.
    """
    t = my_tokenizer_combined.Tokenizer()
    result = t.advanced_tokenize("!")
    self.assertEqual(len(result), 1)
    self.assertEqual(result[0].word, "!")
    self.assertEqual(result[0].start, 1)
    self.assertEqual(result[0].end, 1)
    self.assertEqual(result[0].length, 1)
    self.assertEqual(result[0].kind, "punct")
    self.assertIsInstance(result[0], my_tokenizer_combined.Advanced_Token)
def test_7(self):
    """
    For only one alpha-token as an input string.
    """
    t = my_tokenizer_combined.Tokenizer()
    result = t.advanced_tokenize("слово")
    self.assertEqual(len(result), 1)
    self.assertEqual(result[0].word, "слово")
    self.assertEqual(result[0].start, 1)
    self.assertEqual(result[0].end, 5)
    self.assertEqual(result[0].length, 5)
    self.assertEqual(result[0].kind, "alpha")
    self.assertIsInstance(result[0], my_tokenizer_combined.Advanced_Token)
def test_9(self):
    """
    For only one digit-token as an input string.
    """
    t = my_tokenizer_combined.Tokenizer()
    result = t.advanced_tokenize("1")
    self.assertEqual(len(result), 1)
    self.assertEqual(result[0].word, "1")
    self.assertEqual(result[0].start, 1)
    self.assertEqual(result[0].end, 1)
    self.assertEqual(result[0].length, 1)
    self.assertEqual(result[0].kind, "digit")
    self.assertIsInstance(result[0], my_tokenizer_combined.Advanced_Token)
def test_10(self):
    """
    For only one "other" unicode symbol as an input string.
    """
    t = my_tokenizer_combined.Tokenizer()
    result = t.advanced_tokenize("$")
    self.assertEqual(len(result), 1)
    self.assertEqual(result[0].word, "$")
    self.assertEqual(result[0].start, 1)
    self.assertEqual(result[0].end, 1)
    self.assertEqual(result[0].length, 1)
    self.assertEqual(result[0].kind, "other")
    self.assertIsInstance(result[0], my_tokenizer_combined.Advanced_Token)
def the_second_simplest(self, query, doc_limit, doc_offset):  #### NEW ####
    """
    A method that returns a dict with a GENERATOR of positions of tokens from
    the query in files. It takes a string 'query' as an input, tokenizes it,
    and returns a dict with the names of files (those that contain all tokens
    of the input) as keys and GENERATORS made from the lists of positions in
    those files as values. Its arguments also include 'doc_limit' and
    'doc_offset', which indicate what part of the documents will be shown
    on the final page via our server.
    """
    if not isinstance(query, str):
        raise ValueError("An argument should be a string.")
    dict_for_all = {}
    t = my_tokenizer_combined.Tokenizer()
    tokens = t.iter_tokenize(query.strip())  # strip leading and trailing spaces
    # intersection of keys - files containing all tokens of the input;
    # here we collect them and write them to the variable set_for_all
    for i, token in enumerate(tokens):
        if token.kind == "alpha" or token.kind == "digit":
            if i == 0:
                set_for_all = set(self.the_simplest(token.word))
            set_for_all &= set(self.the_simplest(token.word))
    list_from_set = list(set_for_all)
    list_from_set.sort()
    # now we take only a part of list_from_set
    new_list = list_from_set[doc_offset:doc_offset + doc_limit]
    ### NEW ###
    # create a dict with file names as keys and lists of lists as values, where
    # each inner list contains the positions of one query word in this file
    for file in new_list:
        tokens = t.iter_tokenize(query)
        list_of_lists = []
        for token in tokens:
            current_list = []
            if token.kind == "alpha" or token.kind == "digit":
                current_list = self.the_simplest(token.word)[file]
            list_of_lists.append(current_list)
        # for abstract_iterator to deal with context windows we defined __lt__ for them
        dict_for_all[file] = self.abstract_iterator(list_of_lists)
        # here we delete equal positions, otherwise we get "князькнязь..."
        dict_for_all[file] = self.gen_delete_equal(dict_for_all[file])
    return dict_for_all
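# An illustrative sketch of the pagination step (the file names, limit and
# offset below are hypothetical):
#
#     # if the sorted matching files are ["a.txt", "b.txt", "c.txt", "d.txt"],
#     # then with doc_offset=1 and doc_limit=2 only ["b.txt", "c.txt"] are kept,
#     # and each of them is mapped to a generator of de-duplicated positions.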
def test_3(self):
    """
    For a string that starts and (=or) ends with a digit.
    """
    t = my_tokenizer_combined.Tokenizer()
    result = t.advanced_tokenize("1 some string with digits 5")
    self.assertEqual(len(result), 11)
    self.assertEqual(result[0].word, "1")
    self.assertEqual(result[0].length, 1)
    self.assertEqual(result[0].kind, "digit")
    self.assertEqual(result[2].word, "some")
    self.assertEqual(result[2].kind, "alpha")
    self.assertEqual(result[10].word, "5")
    self.assertEqual(result[10].length, 1)
    self.assertEqual(result[10].kind, "digit")
def test_2(self):
    """
    For a string that starts and (=or) ends with a space.
    """
    t = my_tokenizer_combined.Tokenizer()
    result = t.advanced_tokenize("  some string with spaces ")
    self.assertEqual(len(result), 9)
    self.assertEqual(result[0].word, "  ")
    self.assertEqual(result[0].length, 2)
    self.assertEqual(result[0].kind, "space")
    self.assertEqual(result[1].word, "some")
    self.assertEqual(result[1].kind, "alpha")
    self.assertEqual(result[8].word, " ")
    self.assertEqual(result[8].length, 1)
    self.assertEqual(result[8].kind, "space")
def test_5(self):
    """
    For a string that starts and (=or) ends with an "other" unicode symbol.
    """
    t = my_tokenizer_combined.Tokenizer()
    result = t.advanced_tokenize("$some string with \"other\" symols$")
    self.assertEqual(len(result), 13)
    self.assertEqual(result[0].word, "$")
    self.assertEqual(result[0].length, 1)
    self.assertEqual(result[0].kind, "other")
    self.assertEqual(result[1].word, "some")
    self.assertEqual(result[1].kind, "alpha")
    self.assertEqual(result[2].word, " ")
    self.assertEqual(result[2].length, 1)
    self.assertEqual(result[2].kind, "space")
    self.assertEqual(result[12].word, "$")
    self.assertEqual(result[12].length, 1)
    self.assertEqual(result[12].kind, "other")
def test_4(self):
    """
    For a string that starts and (=or) ends with a punctuation mark.
    """
    t = my_tokenizer_combined.Tokenizer()
    result = t.advanced_tokenize("_some string with punctuation_")
    self.assertEqual(len(result), 9)
    self.assertEqual(result[0].word, "_")
    self.assertEqual(result[0].length, 1)
    self.assertEqual(result[0].kind, "punct")
    self.assertEqual(result[1].word, "some")
    self.assertEqual(result[1].kind, "alpha")
    self.assertEqual(result[2].word, " ")
    self.assertEqual(result[2].length, 1)
    self.assertEqual(result[2].kind, "space")
    self.assertEqual(result[8].word, "_")
    self.assertEqual(result[8].length, 1)
    self.assertEqual(result[8].kind, "punct")
def create_index_from_string(self, string):
    """
    Takes a string as an argument and creates a dictionary, which contains
    words or digits as keys and a list of their 'basic' positions in the
    string as values.
    """
    if not isinstance(string, str):
        raise ValueError("This indexer works only with strings.")
    our_dict = {}
    for obj in t.Tokenizer().iter_tokenize(string):
        # we include only words or digits in the final dict
        if obj.kind == "alpha" or obj.kind == "digit":
            # setdefault returns dict[key], adding the key with the default
            # value if the key is not in the dict yet
            positions = our_dict.setdefault(obj.word, [])
            positions.append(Basic_Position(obj.start, obj.end))
    return our_dict
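# An illustrative sketch of the output (the instance name `indexer` and the
# input string are hypothetical; positions are 1-based and end-inclusive, as
# in the tokenizer tests):
#
#     index = indexer.create_index_from_string("cat 12 cat")
#     # index might look like:
#     # {"cat": [Basic_Position(1, 3), Basic_Position(8, 10)],
#     #  "12":  [Basic_Position(5, 6)]}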
def get_window(self, file_pos, right_string, length):
    """
    Returns a Context_Window instance, given a position of a word in a file,
    the corresponding string from the file and the length of the window.
    """
    # this part of the code is the same as in the get_context method
    tokenizer = my_tokenizer_combined.Tokenizer()
    # string[start:end:-1] returns an inverted string
    j = 0  # j is just a counter
    start = 1  # in case the 'length' of the context goes out of the string's range
    if file_pos.start != 1:
        # otherwise start = 1 and we don't need to alter it
        for token in tokenizer.iter_tokenize(
                right_string[file_pos.start - 2::-1]):
            if j < length:
                if token.kind == "alpha" or token.kind == "digit":
                    j += 1
                continue
            # because we go through an inverted string
            start = file_pos.start - token.end + 1
            break
    n = 0
    # in case the 'length' of the context goes out of the string's range
    end = len(right_string) - 1  # the last symbol is '\r' - we don't need it
    for token in tokenizer.iter_tokenize(right_string[file_pos.end:]):
        if n < length:
            if token.kind == "alpha" or token.kind == "digit":
                n += 1
            continue
        # because 'token.end' is computed relative to the substring
        end = file_pos.end + token.end - 1
        break
    # [:-1] - not to include the final '\r'
    return Context_Window([file_pos], start, end, right_string[:-1])
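# An illustrative sketch of the boundaries this method computes (the instance
# name `engine`, the string, the position and the window length are made up):
#
#     pos = my_indexer_combined.File_Position(17, 19, 1)  # "fox" on line 1
#     win = engine.get_window(pos, "the quick brown fox jumps high\n", 1)
#     # with length=1 the window extends one word to each side:
#     # the computed start is 11 (the first letter of "brown")
#     # and the computed end is 25 (the last letter of "jumps")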