def __init__(self, database): """ @param database: datadase of tokens and thier positions """ self.database = shelve.open(database, writeback=True) self.tokenizator = Tokenizator()
def __init__(self, database): """ Constructor for an database @param database: element of Indexer, contains a dictionary where token is a key, and a value is a dictionary where the key is filename, and the value is a list of positions """ self.database = shelve.open(database, writeback=True) self.tokenizator = Tokenizator()
import shelve

# Project-local helpers (Tokenizator, Position, Position_Plus) are assumed to
# be importable from the surrounding package.


class Indexer(object):
    """
    class Indexer
    Contains the database with indexed tokens
    """

    def __init__(self, database):
        """
        Constructor of the Indexer
        @param database: path to the shelve database; it holds a dictionary where
        a token is a key and the value is a dictionary mapping a filename to a
        list of positions
        """
        self.database = shelve.open(database, writeback=True)
        self.tokenizator = Tokenizator()

    def __del__(self):
        self.database.close()

    def get_index(self, filename):
        """
        This function indexes the text of a given file
        """
        if not isinstance(filename, str):
            raise TypeError('Input has an inappropriate type!')
        my_file = open(filename)
        for token in self.tokenizator.token_gen(my_file.read()):
            start = token.position
            end = start + len(token.s)
            pos = Position(start, end)
            self.database.setdefault(token.s, {}).setdefault(filename, []).append(pos)
        my_file.close()

    def get_index_with_line(self, filename):
        """
        This function indexes the text of a given file, keeping line numbers
        """
        if not isinstance(filename, str):
            raise TypeError('Input has an inappropriate type!')
        my_file = open(filename)
        # enumerate already tracks the line number, so no manual counter is needed
        for lnumber, line in enumerate(my_file):
            for token in self.tokenizator.token_gen(line):
                start = token.position
                end = start + len(token.s)
                pos = Position_Plus(lnumber, start, end)
                self.database.setdefault(token.s, {}).setdefault(filename, []).append(pos)
        my_file.close()
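# Minimal usage sketch for the Indexer above. The shelve path 'index_db' and
# the file name 'example.txt' are illustrative only; Tokenizator, Position and
# Position_Plus are assumed to come from the project's own modules.
if __name__ == '__main__':
    indexer = Indexer('index_db')                 # opens (or creates) the shelve file
    indexer.get_index_with_line('example.txt')    # token -> {filename: [Position_Plus]}
    print(dict(indexer.database))                 # inspect the resulting index
    del indexer                                   # __del__ closes the shelve database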
import unittest

# Tokenizator is assumed to be importable from the project's tokenizer module.


class MakingArrayTest(unittest.TestCase):
    def setUp(self):
        self.Tokenizator = Tokenizator()

    def test_words_split(self):
        s = 'мама мыла раму'
        self.assertEqual(self.Tokenizator.tokenize(s), ['мама', 'мыла', 'раму'])

    def test_isalnum(self):
        s = 'а233465'
        self.assertEqual(self.Tokenizator.tokenize(s), ['а'])

    def test_isanalpha(self):
        s = 'Мамамылараму'
        self.assertEqual(self.Tokenizator.tokenize(s), ['Мамамылараму'])
        s = '7574мама 00мыла 778раму'
        self.assertEqual(self.Tokenizator.tokenize(s), ['мама', 'мыла', 'раму'])

    def test_empty_string(self):
        s = ''
        self.assertEqual(self.Tokenizator.tokenize(s), [])
    def createIndexFile(self):
        print('Starting Index Creation...')
        # Open the collection file
        documents = ReadDocuments(self.__collection)
        for doc in documents:
            # Load the document id
            docid = doc.docid
            # Count the collection size
            self.__collectionSize += 1
            for line in doc.lines:
                token = Tokenizator(line, self.__tok).toToken()
                for tok in token:
                    word = Stemmator(tok, self.__stem).toStem()
                    if word not in self.__stoplist:
                        if word not in self.__index:
                            self.__index[word][docid] = 1
                            self.__dic_df[word] = 1
                        else:
                            if docid in self.__index[word]:
                                self.__index[word][docid] += 1
                            else:
                                self.__index[word][docid] = 1
                                self.__dic_df[word] += 1
        f = open(self.__indexFile, 'w')
        print("<IndexFile Size_Collection=", int(self.__collectionSize), ">\n", file=f)
        for word in self.__index:
            self.__dic_idf[word] = math.log10(self.__collectionSize / self.__dic_df[word])
            print("\t<word=", word, " document_frequency=", self.__dic_df[word],
                  " inverse_document_frequency=", self.__dic_idf[word], ">", file=f)
            for idoc in self.__index[word]:
                self.__dic_tfidf[idoc][word] = int(self.__index[word][idoc]) * self.__dic_idf[word]
                print("\t\t<iddoc=", int(idoc), " term_frequency=", int(self.__index[word][idoc]),
                      " tfidf=", self.__dic_tfidf[idoc][word], " />", file=f)
            print("\t</word>\n", file=f)
        print("</IndexFile>", file=f)
        f.close()
        print('Index Created...')
        return self.__index
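# The weighting used above is classic tf-idf: idf(w) = log10(N / df(w)) and
# tfidf(w, d) = tf(w, d) * idf(w). Below is a small self-contained sketch of
# that computation, independent of the ReadDocuments/NestedDict machinery.
# All names here are illustrative, not part of the original code.
import math


def tf_idf(term_freqs, collection_size):
    """term_freqs: {word: {docid: tf}}; returns ({word: idf}, {docid: {word: tfidf}})."""
    idf, tfidf = {}, {}
    for word, postings in term_freqs.items():
        # df(w) = number of documents containing the word
        idf[word] = math.log10(collection_size / len(postings))
        for docid, tf in postings.items():
            tfidf.setdefault(docid, {})[word] = tf * idf[word]
    return idf, tfidf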
    def __CreateQuerieDict(self, onequerie):
        print('Loading Queries...')
        documents = ReadDocuments(self.__querieFile)
        queries = NestedDict()
        for doc in documents:
            docid = doc.docid
            if onequerie != 0:
                if docid != onequerie:
                    continue
            for line in doc.lines:
                token = Tokenizator(line, self.__tok).toToken()
                for tok in token:
                    word = Stemmator(tok, self.__stem).toStem()
                    if word not in self.__stoplist:
                        if word not in queries[docid]:
                            queries[docid][word] = 0
                        queries[docid][word] += 1
        print('Queries loaded...')
        return queries
    def __loadStringQuery(self, opts):
        t = Tokenizator(opts, 'WPT').toToken()
        return t
import shelve

# Project-local helpers (Tokenizator, Context_Window) are assumed to be
# importable from the surrounding package.


class SearchEngine(object):
    """
    class SearchEngine
    """

    def __init__(self, database):
        """
        @param database: database of tokens and their positions
        """
        self.database = shelve.open(database, writeback=True)
        self.tokenizator = Tokenizator()

    def __del__(self):
        self.database.close()

    def get_dict(self, tok_str):
        """
        This function searches for the positions of a given token
        @param tok_str: str containing the token
        @return: dictionary where a key is a filename and a value is a list of positions
        """
        if not isinstance(tok_str, str):
            raise TypeError('Input has an inappropriate type!')
        if tok_str in self.database:
            return self.database[tok_str]
        else:
            return {}

    def get_dict_many_tokens(self, tok_str):
        """
        This function searches for the positions of the given tokens
        @param tok_str: str containing the tokens
        @return: dictionary where a key is a filename and a value is a list of
        positions of all tokens
        """
        if not isinstance(tok_str, str):
            raise TypeError('Input has an inappropriate type!')
        if not tok_str:
            return {}
        big_dict_files = []
        for token in self.tokenizator.token_gen(tok_str):
            # extract the tokens and collect their file dictionaries
            big_dict_files.append(self.get_dict(token.s))
        files = set(big_dict_files[0])
        for file_dict in big_dict_files[1:]:
            # intersect the file names
            files = files.intersection(set(file_dict))
        output_dict = {}
        for filename in files:
            for token in self.tokenizator.token_gen(tok_str):
                output_dict.setdefault(filename, []).extend(
                    self.database[token.s][filename])
            # sort positions
            output_dict[filename].sort()
        return output_dict

    def get_dict_many_tokens_limit_offset(self, tok_str, limit=3, offset=0):
        """
        This function searches for the positions of the given tokens
        @param tok_str: str containing the tokens
        @param limit: number of files to be returned
        @param offset: from which file to start
        @return: dictionary where a key is a filename and a value is a list of
        positions of all tokens
        """
        if not isinstance(tok_str, str):
            raise TypeError('Input has an inappropriate type!')
        if not isinstance(limit, int) or not isinstance(offset, int):
            raise TypeError('Input has an inappropriate type!')
        if not tok_str:
            return {}
        # in case the offset is negative
        if offset < 0:
            offset = 0
        big_dict_files = []
        for token in self.tokenizator.token_gen(tok_str):
            # extract the tokens and collect their file dictionaries
            big_dict_files.append(self.get_dict(token.s))
        files = set(big_dict_files[0])
        for file_dict in big_dict_files[1:]:
            # intersect the file names
            files = files.intersection(set(file_dict))
        # sort the files and cut the results by limit and offset
        resulted_files = sorted(files)[offset:limit + offset]
        # build the resulting dictionary
        output_dict = {}
        # write the required results into it
        for filename in resulted_files:
            for token in self.tokenizator.token_gen(tok_str):
                output_dict.setdefault(filename, []).extend(
                    self.database[token.s][filename])
            # sort positions
            output_dict[filename].sort()
        return output_dict

    def get_dict_many_tokens_limit_offset_generator(self, tok_str, limit=3, offset=0):
        """
        This function searches for the positions of the given tokens
        @param tok_str: str containing the tokens
        @param limit: number of files to be returned
        @param offset: from which file to start
        @return: dictionary where a key is a filename and a value is a position generator
        """
        if not isinstance(tok_str, str):
            raise TypeError('Input has an inappropriate type!')
        if not isinstance(limit, int) or not isinstance(offset, int):
            raise TypeError('Input has an inappropriate type!')
        if not tok_str:
            return {}
        # in case the offset is negative
        if offset < 0:
            offset = 0
        big_dict_files = []
        # a dictionary of the form filename: list of position lists
        lists = {}
        for token in self.tokenizator.token_gen(tok_str):
            # look up the token by its string representation
            found = self.get_dict(token.s)
            # add it to the list
            big_dict_files.append(set(found))
            # fill the lists dictionary
            for file in found:
                lists.setdefault(file, []).append(found[file])
        files = big_dict_files[0]
        for file_dict in big_dict_files[1:]:
            # intersect the file names
            files = files.intersection(set(file_dict))
        # sort the files and cut the results by limit and offset
        resulted_files = sorted(files)[offset:limit + offset]
        # build the resulting dictionary
        output_dict = {}
        # write the required results into it
        for filename in resulted_files:
            for token in self.tokenizator.token_gen(tok_str):
                output_dict[filename] = self.position_generator(lists[filename])
        return output_dict

    def unite_all(self, dictionary, win_size):
        '''
        This function unites context windows
        @param dictionary: input dictionary filename: Positions
        @param win_size: the size of a context window
        @return: a dictionary filename: Context Windows
        '''
        if not isinstance(dictionary, dict):
            raise TypeError('Input has an inappropriate type!')
        output_dict = {}
        # value is an array of positions
        for key, value in dictionary.items():
            # create the list each time so that every window has its own list of positions
            win_array = output_dict.setdefault(key, [])
            pos_array = value
            # for each position in the values get a window
            for num, pos in enumerate(pos_array):
                # While walking the array and comparing an element with the previous one,
                # remember that we start at element 0: comparing it with element -1 would
                # compare it with the last element, which is not what we want. A single
                # element could also be compared with itself and dropped as a duplicate,
                # leaving an empty array. Hence the check num > 0.
                if num > 0 and pos_array[num] == pos_array[num - 1]:
                    continue
                window = Context_Window.get_window(key, pos, win_size)
                win_array.append(window)
        # here the windows are united
        for key, win_array in output_dict.items():
            # reset the index for each file
            i = 0
            while i < len(win_array) - 1:
                if win_array[i].is_crossed(win_array[i + 1]):
                    win_array[i].get_united_window(win_array[i + 1])
                    win_array.remove(win_array[i + 1])
                else:
                    i += 1
        return output_dict

    def unite_extended(self, query, win_size):
        '''
        This function unites extended windows in a dictionary.
        It takes query and win_size, makes a dictionary from the query, then makes
        a new one with windows as values, extends these windows and finally unites
        them once again.
        @param query: string query
        @param win_size: size of a window
        @return: dictionary with extended and reunited windows
        '''
        if not isinstance(query, str) or not isinstance(win_size, int):
            raise TypeError('Input has an inappropriate type! %s, %s' % (query, win_size))
        to_dict = self.get_dict_many_tokens(query)
        dictionary = self.unite_all(to_dict, win_size)
        for value in dictionary.values():
            for window in value:
                # the function only modifies the window in place and returns nothing,
                # so call it like this:
                # here the window is extended to the sentence boundaries
                window.extend_window()
        for key, win_array in dictionary.items():
            i = 0
            while i < len(win_array) - 1:
                if win_array[i].is_crossed(win_array[i + 1]):
                    win_array[i].get_united_window(win_array[i + 1])
                    win_array.remove(win_array[i + 1])
                else:
                    i += 1
        return dictionary

    def unite_extended_limit_offset(self, query, win_size, limit=3, offset=0):
        '''
        This function unites extended windows in a dictionary (the dictionary
        contains only a limited number of documents starting from offset).
        It takes query and win_size, makes a dictionary from the query, then makes
        a new one with windows as values, extends these windows and finally unites
        them once again.
        @param query: string query
        @param win_size: size of a window
        @param limit: number of documents to return
        @param offset: from which document to start
        @return: dictionary with extended and reunited windows
        '''
        if not isinstance(query, str) or not isinstance(win_size, int):
            raise TypeError('Input has an inappropriate type! %s, %s' % (query, win_size))
        if not isinstance(limit, int) or not isinstance(offset, int):
            raise TypeError('Input has an inappropriate type!')
        # use the new multi-token search function that takes limit and offset into account
        to_dict = self.get_dict_many_tokens_limit_offset(query, limit, offset)
        dictionary = self.unite_all(to_dict, win_size)
        for value in dictionary.values():
            for window in value:
                # the function only modifies the window in place and returns nothing;
                # here the window is extended to the sentence boundaries
                window.extend_window()
        for key, win_array in dictionary.items():
            i = 0
            while i < len(win_array) - 1:
                if win_array[i].is_crossed(win_array[i + 1]):
                    win_array[i].get_united_window(win_array[i + 1])
                    win_array.remove(win_array[i + 1])
                else:
                    i += 1
        return dictionary

    def get_context_gen(self, query, win_size, limit, offset):
        '''
        This function uses a generator of context windows to produce a dictionary
        with a window generator; the windows are extended and reunited
        @param query: string query
        @param win_size: size of a window
        @param limit: number of documents to return
        @param offset: from which document to start
        @return: dictionary with a generator of extended and reunited windows
        '''
        if not isinstance(query, str) or not isinstance(win_size, int):
            raise TypeError('Input has an inappropriate type! %s, %s' % (query, win_size))
        if not isinstance(limit, int) or not isinstance(offset, int):
            raise TypeError('Input has an inappropriate type!')
        # get a dictionary of the form filename: position generator
        position_gen_dict = self.get_dict_many_tokens_limit_offset_generator(
            query, limit, offset)
        # build a dictionary of the form filename: window generator
        window_gen_dict = dict()
        for filename in position_gen_dict:
            window_gen_dict[filename] = self.context_generator(
                filename, position_gen_dict[filename], win_size)
        # build a dictionary of the form filename: generator of extended
        # and united windows
        context_dict = dict()
        for filename in window_gen_dict:
            context_dict[filename] = self.context_gen_uniter(
                window_gen_dict[filename])
        # this is the result of the function
        return context_dict

    def get_sentence_gen(self, query, win_size, limit, offset):
        '''
        This function uses a generator of sentences to produce a dictionary
        with a sentence generator; the sentences are extended and reunited
        @param query: string query
        @param win_size: size of a window
        @param limit: number of documents to return
        @param offset: from which document to start
        @return: dictionary with a generator of extended and reunited sentences
        '''
        if not isinstance(query, str) or not isinstance(win_size, int):
            raise TypeError('Input has an inappropriate type! %s, %s' % (query, win_size))
        if not isinstance(limit, int) or not isinstance(offset, int):
            raise TypeError('Input has an inappropriate type!')
        # get a dictionary of the form filename: generator of extended
        # and united windows
        context_dict = self.get_context_gen(query, win_size, limit, offset)
        # build a dictionary of the form filename: sentence generator
        # (debug prints that consumed the generators have been removed)
        sentence_dict = dict()
        for filename in context_dict:
            sentence_dict[filename] = self.sentence_generator(
                context_dict[filename])
        # build the resulting dictionary with extended and united sentences
        final_sentence_dict = dict()
        for filename in sentence_dict:
            try:
                final_sentence_dict[filename] = self.sentence_generator_uniter(
                    sentence_dict[filename])
            except StopIteration:
                break
        # this is the result of the function
        return final_sentence_dict

    def query_search(self, query, win_size):
        '''
        This function searches for a query in the database and returns
        a dictionary filename: query in string format
        @param query: query to search
        @param win_size: the size of a context window
        @return: dictionary {filename: [query(str)]}
        '''
        if not isinstance(query, str) or not isinstance(win_size, int):
            raise TypeError('Input has an inappropriate type! %s, %s' % (query, win_size))
        output_dict = {}
        dictionary = self.unite_extended(query, win_size=1)
        for key, value in dictionary.items():
            for window in value:
                string = window.highlight_window()
                output_dict.setdefault(key, []).append(string)
        return output_dict

    def query_search_modified(self, query, win_size=1, limit=3, offset=0):
        '''
        This function searches for a query in the database and returns
        a dictionary filename: query in string format.
        It uses the new search function unite_extended_limit_offset()
        and therefore takes limit and offset as arguments.
        @param query: query to search
        @param win_size: the size of a context window
        @param limit: number of documents to return
        @param offset: from which document to start
        @return: dictionary {filename: [query(str)]}
        '''
        if not isinstance(query, str) or not isinstance(win_size, int):
            raise TypeError('Input has an inappropriate type! %s, %s' % (query, win_size))
        output_dict = {}
        dictionary = self.unite_extended_limit_offset(query, win_size, limit, offset)
        for key, value in dictionary.items():
            for window in value:
                string = window.highlight_window()
                output_dict.setdefault(key, []).append(string)
        return output_dict

    def qulim_search(self, query, win_size, limit, offset, doc_limof):
        '''
        This function searches for a query in the database and returns
        a dictionary filename: query in string format
        @param query: query to search
        @param win_size: the size of a context window
        @param limit: max number of documents to show
        @param offset: document number to start with
        @param doc_limof: list of pairs giving the limit and offset of each
        concrete document; no more quotes are shown than this document limit
        @return: dictionary {filename: [query(str)]}
        '''
        if not isinstance(query, str) or not isinstance(
                limit, int) or not isinstance(offset, int):
            raise TypeError('Input has an inappropriate type! %s, %s, %s' % (query, limit, offset))
        # dictionary for results
        output_dict = dict()
        # number of the document
        qunum = 0
        dictionary = self.unite_extended(query, win_size)
        for number, filename in enumerate(sorted(dictionary)):
            if number == limit + offset:
                break
            if number >= offset and number < limit + offset:
                # create a list for each file
                output_dict.setdefault(filename, [])
                # get all the quotes in the file
                all_quotes = dictionary[filename]
                # limit for the document
                qulim = doc_limof[qunum][0]
                # offset for the document
                quset = doc_limof[qunum][1]
                for num, quote in enumerate(all_quotes):
                    if num == qulim + quset:
                        break
                    if num >= quset and num < qulim + quset:
                        output_dict[filename].append(quote.highlight_window())
                qunum += 1
        return output_dict

    def qulim_search_modified(self, query, win_size=1, limit=3, offset=0,
                              doc_limof=[(3, 0), (3, 0), (3, 0)]):
        '''
        This function searches for a query in the database and returns
        a dictionary filename: query in string format
        @param query: query to search
        @param win_size: the size of a context window
        @param limit: max number of documents to show
        @param offset: document number to start with
        @param doc_limof: list of pairs giving the limit and offset of each
        concrete document; no more quotes are shown than this document limit
        @return: dictionary {filename: [query(str)]}
        '''
        if not isinstance(query, str) or not isinstance(
                limit, int) or not isinstance(offset, int):
            raise TypeError('Input has an inappropriate type! %s, %s, %s' % (query, limit, offset))
        # dictionary for results
        output_dict = dict()
        # number of the document
        qunum = 0
        dictionary = self.unite_extended_limit_offset(query, win_size, limit, offset)
        # there is no need to compare against the document limits and offsets here,
        # because the new function already takes them into account,
        # so simply iterate over the files in the sorted dictionary
        for filename in sorted(dictionary):
            # create a list for each file
            output_dict.setdefault(filename, [])
            # get all the quotes for this file from the dictionary
            all_quotes = dictionary[filename]
            # limit for the document
            qulim = doc_limof[qunum][0]
            if not qulim:
                qulim = 3
            # offset for the document
            quset = doc_limof[qunum][1]
            if not quset:
                quset = 0
            for num, quote in enumerate(all_quotes):
                if num == qulim + quset:
                    break
                if num >= quset and num < qulim + quset:
                    output_dict[filename].append(quote.highlight_window())
            qunum += 1
        return output_dict

    def qulim_search_modified_gen(self, query, win_size=1, limit=3, offset=0,
                                  doc_limof=[(3, 0), (3, 0), (3, 0)]):
        """
        This function searches for a query in the database and returns
        a dictionary filename: query in string format.
        It uses generators to work faster than the previous function
        qulim_search_modified.
        @param query: query to search
        @param win_size: the size of a context window
        @param limit: max number of documents to show
        @param offset: document number to start with
        @param doc_limof: list of pairs giving the limit and offset of each
        concrete document; no more quotes are shown than this document limit
        @return: dictionary {filename: [query(str)]}
        """
        if not isinstance(query, str) or not isinstance(
                limit, int) or not isinstance(offset, int):
            raise TypeError('Input has an inappropriate type! %s, %s, %s' % (query, limit, offset))
        # dictionary for results
        output_dict = dict()
        # number of the document
        qunum = 0
        # use the new function with a generator
        dictionary = self.get_sentence_gen(query, win_size, limit, offset)
        for filename in sorted(dictionary):
            qulim = doc_limof[qunum][0]
            if not qulim:
                qulim = 3
            # offset for the document
            quset = doc_limof[qunum][1]
            if not quset:
                quset = 0
            output_dict.setdefault(filename, [])
            # skip the first quset sentences of this file's generator
            for _ in range(quset):
                next(dictionary[filename])
            for _ in range(qulim):
                try:
                    output_dict[filename].append(
                        next(dictionary[filename]).highlight_window())
                except StopIteration:
                    break
            qunum += 1
        return output_dict

    def position_generator(self, lists):
        '''
        This function generates positions
        @param lists: list of lists of positions
        It yields the minimal position among the first elements of the given lists
        '''
        # turn the lists into iterators
        iters = [iter(x) for x in lists]
        # a list with the first element of each list
        firsts = [next(it) for it in iters]
        while firsts:
            position_iter = min(firsts)
            yield position_iter
            # the index of the list this element was taken from
            position_iter_pos = firsts.index(position_iter)
            try:
                # move on to the next element of that list
                firsts[position_iter_pos] = next(iters[position_iter_pos])
            except StopIteration:
                # if one of the lists is exhausted, its first element and its
                # iterator are no longer needed, so remove them
                iters.pop(position_iter_pos)
                firsts.pop(position_iter_pos)

    def context_generator(self, filename, position_generator, win_size):
        """
        This function creates context windows from a given file using
        the position generator
        @param filename: the name of a file
        @param position_generator: generator which generates positions
        @param win_size: the size of a future context window
        @return: context windows, i.e. objects of the Context_Window class
        """
        if not isinstance(filename, str) or not isinstance(win_size, int):
            raise TypeError('Input has an inappropriate type!')
        for pos in position_generator:
            window = Context_Window.get_window(filename, pos, win_size)
            yield window

    def context_gen_uniter(self, context_generator):
        """
        This function checks whether the generated windows intersect and unites them
        @param context_generator: generator of context windows
        @return: united context windows
        """
        iterator = iter(context_generator)
        # the first window, the start of the pass; an empty input yields nothing
        try:
            previous = next(iterator)
        except StopIteration:
            return
        # walk the remaining windows pairwise: unite overlapping neighbours,
        # otherwise emit the finished window
        for next_window in iterator:
            if previous.is_crossed(next_window):
                previous.get_united_window(next_window)
            else:
                yield previous
                previous = next_window
        yield previous

    def sentence_generator(self, context_gen_uniter):
        """
        This function generates sentences using context_gen_uniter
        @param context_gen_uniter: generator of united context windows
        @return: extended windows
        """
        for window in context_gen_uniter:
            window.extend_window()
            yield window

    def sentence_generator_uniter(self, sentence_generator):
        """
        This function checks whether the generated sentences intersect and unites them
        @param sentence_generator: generator of sentences
        @return: extended windows after their union
        """
        iterator = iter(sentence_generator)
        # if the input is empty, there is nothing to unite
        try:
            previous = next(iterator)
        except StopIteration:
            return
        for next_window in iterator:
            if previous.is_crossed(next_window):
                previous.get_united_window(next_window)
            else:
                yield previous
                previous = next_window
        yield previous
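# Hedged usage sketch for the search side, assuming 'index_db' already contains
# an index built by the Indexer over the same files. The query string and all
# file names are illustrative, not part of the original code.
if __name__ == '__main__':
    engine = SearchEngine('index_db')
    # a window of one token on each side, first three matching documents
    results = engine.query_search_modified('sample query', win_size=1, limit=3, offset=0)
    for filename, quotes in results.items():
        print(filename)
        for quote in quotes:
            print('   ', quote)   # query words are wrapped in <b>...</b>
    del engine                    # __del__ closes the shelve database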
class Context_Window(object):
    """
    class Context_Window
    """
    tokenizator = Tokenizator()

    def __init__(self, string, positions, win_start, win_end):
        """
        Constructor of a context window
        @param string: the line of text in which the window is found
        @param positions: positions of the tokens in the window
        @param win_start: position where the window starts
        @param win_end: position where the window ends
        """
        self.string = string
        self.positions = positions
        self.win_start = win_start
        self.win_end = win_end

    def __eq__(self, window):
        return (self.string == window.string
                and self.positions == window.positions
                and self.win_start == window.win_start
                and self.win_end == window.win_end)

    def __repr__(self):
        return str(self.string) + ' ' + str(self.positions) + ' ' + str(
            self.win_start) + ' ' + str(self.win_end)

    @classmethod
    def get_window(cls, filename, position, win_size):
        """
        This function returns the context window of a given token position
        @param filename: the name of the file where the token is to be found
        @param position: the position of a token
        @param win_size: desired size of the context window
        @return: a context window
        """
        if not isinstance(filename, str) or not isinstance(win_size, int):
            raise TypeError('Input has an inappropriate type!')
        positions = []
        positions.append(position)
        win_end = 0
        win_start = 0
        string = None
        str_num = position.lnumber
        my_file = open(filename)
        for lnumber, my_string in enumerate(my_file):
            if lnumber == str_num:
                string = my_string
                break
        if string is None:
            my_file.close()
            raise TypeError('This string was not found!')
        for tok_num, token in enumerate(
                cls.tokenizator.token_gen(string[position.start:])):
            if tok_num == 0:
                win_end = position.end
            if tok_num == win_size:
                win_end = token.position + len(token.s) + position.start
                break
        for tok_num, token in enumerate(
                cls.tokenizator.token_gen(string[:position.end][::-1])):
            if tok_num == win_size:
                win_start = position.end - token.position - len(token.s)
                break
        my_file.close()
        return cls(string, positions, win_start, win_end)

    def is_crossed(self, window_B):
        '''
        This function checks whether two windows intersect
        @param window_B: the second window
        @return: True or False
        '''
        if not isinstance(window_B, Context_Window):
            raise TypeError('Input has an inappropriate type!')
        if (self.win_start <= window_B.win_end
                and self.win_end >= window_B.win_start
                and self.positions[0].lnumber == window_B.positions[0].lnumber):
            return True
        if (self.win_start == window_B.win_start
                and self.win_end == window_B.win_end
                and self.positions[0].lnumber == window_B.positions[0].lnumber):
            return True
        else:
            return False

    def get_united_window(self, window_B):
        '''
        This function unites two windows
        @param window_B: the second window
        It changes self so that it has the new positions, and returns nothing
        '''
        if not isinstance(window_B, Context_Window):
            raise TypeError('Input has an inappropriate type!')
        self.positions.extend(window_B.positions)
        self.win_start = min(window_B.win_start, self.win_start)
        self.win_end = max(window_B.win_end, self.win_end)

    def extend_window(self):
        '''
        This function extends a given window to the sentence boundaries
        @return: an extended window
        '''
        to_right = self.string[self.win_end:]
        to_left = self.string[:self.win_start + 1][::-1]
        left = PATTERN_LEFT.search(to_left)
        right = PATTERN_RIGHT.search(to_right)
        if left is None:
            self.win_start = 0
        else:
            self.win_start -= left.start()
        if right is None:
            self.win_end = len(self.string)
        else:
            self.win_end += right.start() + 1

    def highlight_window(self):
        '''
        This function takes the substring of the window's string that
        corresponds to the window size and highlights it
        '''
        win_string = self.string[self.win_start:self.win_end]
        fin = '</b>'
        st = '<b>'
        for position in reversed(self.positions):
            end = position.end - self.win_start
            begin = position.start - self.win_start
            win_string_one = win_string[:end] + fin + win_string[end:]
            win_string = win_string_one[:begin] + st + win_string_one[begin:]
        return win_string
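# A small sketch of the window life cycle, assuming a Position_Plus-style
# object with lnumber/start/end attributes as used by the Indexer above.
# 'text.txt' and the offsets are made up for illustration only.
pos = Position_Plus(0, 5, 9)                       # token on line 0, characters 5..9
window = Context_Window.get_window('text.txt', pos, win_size=2)
window.extend_window()                             # grow to the sentence boundaries
print(window.highlight_window())                   # the token wrapped in <b>...</b>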
import unittest

# Tokenizator and Generator (the generator type checked by assertIsInstance,
# e.g. collections.abc.Generator) are assumed to be imported from the
# project's modules.


class TestMyCode(unittest.TestCase):
    # create a Tokenizator instance for the tests
    def setUp(self):
        self.x = Tokenizator()

    # the tests themselves
    def test_isgenerator(self):
        result = self.x.tokens_generator(' Ф 12 !!! @ # Alina is a student)))')
        self.assertIsInstance(result, Generator)

    def test_begins_with_no_alpha(self):
        result = list(
            self.x.tokens_generator(' ф 12 !!! @ # Alina is a student)))'))
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].s, 'ф')
        self.assertEqual(result[0].position, 1)
        self.assertEqual(result[1].s, 'Alina')
        self.assertEqual(result[1].position, 14)

    def test_begins_with_alpha(self):
        result = list(
            self.x.tokens_generator('ф 12 !!! @ # Alina is a student)))'))
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].s, 'ф')
        self.assertEqual(result[0].position, 0)
        self.assertEqual(result[1].s, 'Alina')
        self.assertEqual(result[1].position, 13)

    def test_ends_with_alpha(self):
        result = list(
            self.x.tokens_generator('ф 12 !!! @ # Alina is a student)))abc'))
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 6)
        self.assertEqual(result[0].s, 'ф')
        self.assertEqual(result[0].position, 0)
        self.assertEqual(result[5].s, 'abc')
        self.assertEqual(result[5].position, 34)

    def test_ends_with_no_alpha(self):
        result = list(
            self.x.tokens_generator('ф 12 !!! @ # Alina is a student)))'))
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].s, 'ф')
        self.assertEqual(result[0].position, 0)
        self.assertEqual(result[4].s, 'student')
        self.assertEqual(result[4].position, 24)

    def test_MyError_number(self):
        with self.assertRaises(ValueError):
            list(self.x.tokens_generator(12))

    def test_MyError_notList(self):
        s = [1, 2, 3, 'this is my string']
        with self.assertRaises(ValueError):
            list(self.x.tokens_generator(s))

    def test_Function_begins_with_no_alpha(self):
        result = self.x.tokenize(' ф 12 !!! @ # Alina is a student)))')
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].s, 'ф')
        self.assertEqual(result[0].position, 1)
        self.assertEqual(result[1].s, 'Alina')
        self.assertEqual(result[1].position, 14)

    def test_function_begins_with_alpha(self):
        result = self.x.tokenize('ф 12 !!! @ # Alina is a student)))')
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].s, 'ф')
        self.assertEqual(result[0].position, 0)
        self.assertEqual(result[1].s, 'Alina')
        self.assertEqual(result[1].position, 13)

    def test_function_ends_with_alpha(self):
        result = self.x.tokenize('ф 12 !!! @ # Alina is a student)))abc')
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 6)
        self.assertEqual(result[0].s, 'ф')
        self.assertEqual(result[0].position, 0)
        self.assertEqual(result[5].s, 'abc')
        self.assertEqual(result[5].position, 34)

    def test_function_ends_with_no_alpha(self):
        result = self.x.tokenize('ф 12 !!! @ # Alina is a student)))')
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].s, 'ф')
        self.assertEqual(result[0].position, 0)
        self.assertEqual(result[4].s, 'student')
        self.assertEqual(result[4].position, 24)

    def test_MyError_function_number(self):
        with self.assertRaises(ValueError):
            self.x.tokenize(12)

    def test_MyError_function_notList(self):
        s = [1, 2, 3, 'this is my string']
        with self.assertRaises(ValueError):
            self.x.tokenize(s)

    def test_isgenerator_for_token_gen(self):
        result = self.x.token_gen(' Ф 12 !!! @ # Alina is a student)))')
        self.assertIsInstance(result, Generator)

    def test_my_token_gen(self):
        result = list(self.x.token_gen(' Ф 12 !!! @ # Alina is a student)))'))
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 6)
        self.assertEqual(result[0].s, 'Ф')
        self.assertEqual(result[0].tp, 'alpha')
        self.assertEqual(result[0].position, 1)
        self.assertEqual(result[1].s, '12')
        self.assertEqual(result[1].tp, 'digit')
        self.assertEqual(result[1].position, 3)
        self.assertEqual(result[5].s, 'student')
        self.assertEqual(result[5].tp, 'alpha')
        self.assertEqual(result[5].position, 25)

    def test_MyError_token_gen_number(self):
        with self.assertRaises(ValueError):
            list(self.x.token_gen(12))

    def test_MyError_token_gen_notList(self):
        s = [1, 2, 3, 'this is my string']
        with self.assertRaises(ValueError):
            list(self.x.token_gen(s))

    def test_empty_string(self):
        result = ''
        self.assertEqual(len(result), 0)