Example #1
    def __init__(self, database):
        """
        @param database: database of tokens and their positions
        """

        self.database = shelve.open(database, writeback=True)
        self.tokenizator = Tokenizator()
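
The constructor relies on shelve's writeback mode: with writeback=True, values read from the shelf are cached in memory, so in-place mutations of nested objects (such as appending to a position list) survive until close() flushes them to disk. A minimal standalone sketch of this behaviour; the shelf name 'tokens_db' is hypothetical:

import shelve

# Minimal sketch of the writeback semantics assumed above;
# the shelf name 'tokens_db' is hypothetical.
db = shelve.open('tokens_db', writeback=True)
db.setdefault('token', {}).setdefault('file.txt', []).append((0, 5))
db.close()  # flushes the cached, mutated entries to disk

db = shelve.open('tokens_db')
print(db['token'])  # {'file.txt': [(0, 5)]}
db.close()
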
Example #2
 def __init__(self, database):
     """
     Constructor for a database
     @param database: element of Indexer,
     contains a dictionary where token is a key,
     and a value is a dictionary where the key is filename,
     and the value is a list of positions
     """
     self.database = shelve.open(database, writeback=True)
     self.tokenizator = Tokenizator()
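
To make the docstring concrete, the shelf described above maps each token string to a per-file dictionary of position lists; a hypothetical snapshot (Position as used in Example #3 below):

# Hypothetical shape of the indexed database described above:
# token -> {filename -> [positions]}
database = {
    'mama': {
        'text1.txt': [Position(0, 4), Position(10, 14)],
        'text2.txt': [Position(5, 9)],
    },
}
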
Example #3
class Indexer(object):
    """
    class Indexer
    Contains a database with indexed tokens
    """
    def __init__(self, database):
        """
        Constructor for a database
        @param database: element of Indexer,
        contains a dictionary where token is a key,
        and a value is a dictionary where the key is filename,
        and the value is a list of positions
        """
        self.database = shelve.open(database, writeback=True)
        self.tokenizator = Tokenizator()

    def __del__(self):
        self.database.close()

    def get_index(self, filename):
        """
        This function indexes the text in a given file
        """
        if not isinstance(filename, str):
            raise TypeError('Input has an inappropriate type!')
        my_file = open(filename)
        for token in self.tokenizator.token_gen(my_file.read()):
            start = token.position
            end = start + len(token.s)
            pos = Position(start, end)
            self.database.setdefault(token.s, {}).setdefault(filename,
                                                             []).append(pos)
        my_file.close()

    def get_index_with_line(self, filename):
        """
        This function performs indexing of a text in a given file
        """
        if not isinstance(filename, str):
            raise TypeError('Input has an unappropriate type!')
        my_file = open(filename)
        for lnumber, line in enumerate(my_file):
            if not line:
                lnumber += 1
            for token in self.tokenizator.token_gen(line):
                start = token.position
                end = start + len(token.s)
                pos = Position_Plus(lnumber, start, end)
                self.database.setdefault(token.s,
                                         {}).setdefault(filename,
                                                        []).append(pos)
            lnumber += 1
        my_file.close()
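
A minimal usage sketch for the Indexer above; the shelf name 'index_db' and the file 'sample.txt' are hypothetical:

# Hypothetical usage of the Indexer above.
indexer = Indexer('index_db')              # opens (or creates) the shelf
indexer.get_index_with_line('sample.txt')  # index tokens with line numbers
del indexer                                # __del__ closes the shelf
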
Example #4
class MakingArrayTest(unittest.TestCase):
    def setUp(self):
        self.Tokenizator = Tokenizator()

    def test_words_split(self):
        s = 'мама мыла раму'
        self.assertEqual(self.Tokenizator.tokenize(s),
                         ['мама', 'мыла', 'раму'])

    def test_isalnum(self):
        s = 'а233465'
        self.assertEqual(self.Tokenizator.tokenize(s), ['а'])

    def test_isalpha(self):
        s = 'Мамамылараму'
        self.assertEqual(self.Tokenizator.tokenize(s), ['Мамамылараму'])
        s = '7574мама 00мыла 778раму'
        self.assertEqual(self.Tokenizator.tokenize(s),
                         ['мама', 'мыла', 'раму'])

    def test_empty_string(self):
        s = ''
        self.assertEqual(self.Tokenizator.tokenize(s), [])
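
The tests above pin the behaviour down: tokenize returns the maximal alphabetic runs of the input and drops everything else. A minimal implementation consistent with these tests (a sketch, not necessarily the original class):

import re

class Tokenizator(object):
    # Sketch consistent with the tests above: a token is a maximal run
    # of alphabetic characters; digits and punctuation are separators.
    _ALPHA = re.compile(r'[^\W\d_]+', re.UNICODE)

    def tokenize(self, s):
        return self._ALPHA.findall(s)
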
Example #5
    def createIndexFile(self):
        print 'Starting Index Creation...'
        #Open Collection File
        documents = ReadDocuments(self.__collection)
        for doc in documents:
            #Load Document Id
            docid = doc.docid
            #Counting collection Size
            self.__collectionSize += 1

            for line in doc.lines:
                token = Tokenizator(line, self.__tok).toToken()

                for tok in token:
                    word = Stemmator(tok, self.__stem).toStem()

                    if word not in self.__stoplist:
                        if word not in self.__index:
                            # __index is assumed to autovivify (a
                            # NestedDict), so this nested assignment
                            # creates the inner dictionary on the fly
                            self.__index[word][docid] = 1
                            self.__dic_df[word] = 1

                        else:
                            if docid in self.__index[word]:
                                self.__index[word][docid] += 1

                            else:
                                self.__index[word][docid] = 1
                                self.__dic_df[word] += 1

        f = open(self.__indexFile, 'w')
        print >> f, "<IndexFile Size_Collection=", int(
            self.__collectionSize), ">\n"
        for word in self.__index:
            # force true division; with two ints, Python 2's '/' would floor
            self.__dic_idf[word] = math.log10(
                float(self.__collectionSize) / self.__dic_df[word])
            print >> f, "\t<word=", word, " document_frequency=", self.__dic_df[
                word], " inverse_document_frequency=", self.__dic_idf[
                    word], ">"
            for idoc in self.__index[word]:
                self.__dic_tfidf[idoc][word] = int(
                    self.__index[word][idoc]) * self.__dic_idf[word]
                print >> f, "\t\t<iddoc=", int(idoc), " term_frequency=", int(
                    self.__index[word]
                    [idoc]), " tfidf=", self.__dic_tfidf[idoc][word], " />"
            print >> f, "\t</word>\n"
        print >> f, "</IndexFile>"
        f.close()
        print 'Index Created...'
        return self.__index
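
The weights written to the index file are the classic tf-idf: idf(word) = log10(N / df(word)) and tfidf(word, doc) = tf(word, doc) * idf(word), where N is the collection size and df the number of documents containing the word. A worked example with hypothetical counts:

import math

N = 100   # collection size (hypothetical)
df = 4    # 'word' occurs in 4 documents
tf = 3    # and 3 times in the current document

idf = math.log10(float(N) / df)  # log10(25.0) ~ 1.3979
tfidf = tf * idf                 # ~ 4.1938
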
Example #6
 def __CreateQuerieDict(self,onequerie):
     print 'Loading Queries...'
     documents = ReadDocuments(self.__querieFile)
     queries = NestedDict()
     for doc in documents:
         docid = doc.docid
         if onequerie != 0:
             if docid != onequerie:
                 continue
 
         for line in doc.lines: 
             token = Tokenizator(line,self.__tok).toToken()
             
             for tok in token:
                 word = Stemmator(tok,self.__stem).toStem()
                     
                 if word not in self.__stoplist:
                     if word not in queries[docid]:
                         queries[docid][word]=0
                     queries[docid][word]+=1
                     
     print 'Queries loaded...'  
     return queries 
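
Both here and in Example #5 the code assigns into queries[docid][word] (or __index[word][docid]) before the inner dictionary exists, so NestedDict is assumed to autovivify missing keys. A minimal sketch of such a class:

from collections import defaultdict

class NestedDict(defaultdict):
    # Sketch of the autovivifying dictionary the code above assumes:
    # a missing key produces another NestedDict, so
    # queries[docid][word] = 0 works without preparing the inner dict.
    def __init__(self):
        defaultdict.__init__(self, NestedDict)
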
Example #7
 def setUp(self):
     self.Tokenizator = Tokenizator()
Example #8
 def __loadStringQuery(self, opts):
     t = Tokenizator(opts, 'WPT').toToken()
     return t
Example #9
class SearchEngine(object):
    """
    class SearchEngine
    """
    def __init__(self, database):
        """
        @param database: database of tokens and their positions
        """

        self.database = shelve.open(database, writeback=True)
        self.tokenizator = Tokenizator()

    def __del__(self):
        self.database.close()

    def get_dict(self, tok_str):
        """
        This function performs searching for positions of a given token
        @param tok_str: str containing token
        @return: dictionary, where a key is a filename
        and a value is a list of positions
        """

        if not isinstance(tok_str, str):
            raise TypeError('Input has an inappropriate type!')

        if tok_str in self.database:
            return self.database[tok_str]
        else:
            return {}

    def get_dict_many_tokens(self, tok_str):
        """
        This function performs searching for positions of given tokens
        @param tok_str: str containing tokens
        @return: dictionary, where a key is a filename
        and a value is a list of positions of all tokens     
        """

        if not isinstance(tok_str, str):
            raise TypeError('Input has an inappropriate type!')
        if not tok_str:
            return {}
        big_dict_files = []
        for token in self.tokenizator.token_gen(tok_str):
            # collect the file dictionary of every token in the query
            big_dict_files.append(self.get_dict(token.s))

        files = set(big_dict_files[0])
        for file_dict in big_dict_files[1:]:
            # intersect the sets of filenames
            files = files.intersection(set(file_dict))

        output_dict = {}
        for filename in files:
            for token in self.tokenizator.token_gen(tok_str):
                output_dict.setdefault(filename, []).extend(
                    self.database[token.s][filename])
            # sort positions
            output_dict[filename].sort()
        return output_dict

    def get_dict_many_tokens_limit_offset(self, tok_str, limit=3, offset=0):
        """
        This function performs searching for positions of given tokens
        @param tok_str: str containing tokens
        @param limit: number of files to be returned
        @param offset: from which file to start
        @return: dictionary, where a key is a filename
        and a value is a list of positions of all tokens     
        """

        if not isinstance(tok_str, str):
            raise TypeError('Input has an inappropriate type!')

        if not isinstance(limit, int) or not isinstance(offset, int):
            raise TypeError('Input has an inappropriate type!')

        if not tok_str:
            return {}

        # clamp a negative offset to zero
        if offset < 0:
            offset = 0

        big_dict_files = []
        for token in self.tokenizator.token_gen(tok_str):
            # collect the file dictionary of every token in the query
            big_dict_files.append(self.get_dict(token.s))

        files = set(big_dict_files[0])
        for file_dict in big_dict_files[1:]:
            # intersect the sets of filenames
            files = files.intersection(set(file_dict))

        # sort the results and slice them by limit and offset
        resulted_files = sorted(files)[offset:limit + offset]
        # dictionary for the results
        output_dict = {}
        # fill it with the required entries
        for filename in resulted_files:
            for token in self.tokenizator.token_gen(tok_str):
                output_dict.setdefault(filename, []).extend(
                    self.database[token.s][filename])
            # sort positions
            output_dict[filename].sort()
        return output_dict

    def get_dict_many_tokens_limit_offset_generator(self,
                                                    tok_str,
                                                    limit=3,
                                                    offset=0):
        """
        This function performs searching for positions of given tokens
        @param tok_str: str containing tokens
        @param limit: number of files to be returned
        @param offset: from which file to start
        @return: dictionary, where a key is a filename
        and a value is a position generator    
        """

        if not isinstance(tok_str, str):
            raise TypeError('Input has an inappropriate type!')

        if not isinstance(limit, int) or not isinstance(offset, int):
            raise TypeError('Input has an inappropriate type!')

        if not tok_str:
            return {}

        # clamp a negative offset to zero
        if offset < 0:
            offset = 0

        big_dict_files = []
        # dictionary mapping filename to a list of position lists
        lists = {}
        for token in self.tokenizator.token_gen(tok_str):
            # look the token up by its string representation
            found = self.get_dict(token.s)
            # remember the set of files it occurs in
            big_dict_files.append(set(found))
            # fill the lists dictionary
            for file in found:
                lists.setdefault(file, []).append(found[file])

        files = big_dict_files[0]
        for file_dict in big_dict_files[1:]:
            # intersect the sets of filenames
            files = files.intersection(set(file_dict))

        # sort the results and slice them by limit and offset
        resulted_files = sorted(files)[offset:limit + offset]
        # dictionary for the results: one position generator per file,
        # so there is no need to iterate over the tokens again
        output_dict = {}
        for filename in resulted_files:
            output_dict[filename] = self.position_generator(lists[filename])
        return output_dict

    def unite_all(self, dictionary, win_size):
        '''
        This function unites context windows
        @param dictionary: input dictionary filename:Positions
        @param win_size: a size of a context window
        @return: a dictionary filename: list of context windows
        '''
        if not isinstance(dictionary, dict):
            raise TypeError('Input has an inappropriate type!')
        output_dict = {}
        # value is an array of positions
        for key, value in dictionary.items():
            # create the list freshly for each key so that every file
            # has its own array of windows
            win_array = output_dict.setdefault(key, [])
            pos_array = value
            # for each position in the values build a window
            for num, pos in enumerate(pos_array):
                # skip duplicate positions, but only compare an element
                # with its predecessor for num > 0: index -1 would wrap
                # around to the last element, and a single element would
                # be dropped as its own duplicate, leaving an empty array
                if num > 0 and pos_array[num] == pos_array[num - 1]:
                    continue
                window = Context_Window.get_window(key, pos, win_size)
                win_array.append(window)

        # unite crossing windows; reset the index for every file
        for key, win_array in output_dict.items():
            i = 0
            while i < len(win_array) - 1:
                if win_array[i].is_crossed(win_array[i + 1]):
                    win_array[i].get_united_window(win_array[i + 1])
                    win_array.remove(win_array[i + 1])
                else:
                    i += 1

        return output_dict

    def unite_extended(self, query, win_size):
        '''
        This function unites extended windows in a dictionary
        It takes query and win_size, makes a dictionary from the query,
        then makes a new one with windows as values,
        extends these windows and finally unites them once again
        @param query: string query
        @param win_size: size of a window
        @return: dictionary with extended and reunited windows
        '''

        if not isinstance(query, str) or not isinstance(win_size, int):
            raise TypeError('Input has an inappropriate type! %s, %s' %
                            (query, win_size))

        to_dict = self.get_dict_many_tokens(query)
        dictionary = self.unite_all(to_dict, win_size)
        for value in dictionary.values():
            for window in value:
                # extend_window() mutates the window in place (out to the
                # sentence boundaries) and returns nothing
                window.extend_window()
        # reunite overlapping windows after the extension
        for key, win_array in dictionary.items():
            i = 0
            while i < len(win_array) - 1:
                if win_array[i].is_crossed(win_array[i + 1]):
                    win_array[i].get_united_window(win_array[i + 1])
                    win_array.remove(win_array[i + 1])
                else:
                    i += 1

        return dictionary

    def unite_extended_limit_offset(self, query, win_size, limit=3, offset=0):
        '''
        This function unites extended windows in a dictionary (the dictionary
        contains only a limited number of documents starting from offset)
        It takes query and win_size, makes a dictionary from the query,
        then makes a new one with windows as values,
        extends these windows and finally unites them once again
        @param query: string query
        @param win_size: size of a window
        @param limit: number of documents to return
        @param offset: from which document to start
        @return: dictionary with extended and reunited windows
        '''

        if not isinstance(query, str) or not isinstance(win_size, int):
            raise TypeError('Input has an inappropriate type! %s, %s' %
                            (query, win_size))
        if not isinstance(limit, int) or not isinstance(offset, int):
            raise TypeError('Input has an inappropriate type!')

        # use the multi-token search that honours limit and offset
        to_dict = self.get_dict_many_tokens_limit_offset(query, limit, offset)
        dictionary = self.unite_all(to_dict, win_size)
        for value in dictionary.values():
            for window in value:
                # extend_window() mutates the window in place and
                # returns nothing
                window.extend_window()
        # reunite overlapping windows after the extension
        for key, win_array in dictionary.items():
            i = 0
            while i < len(win_array) - 1:
                if win_array[i].is_crossed(win_array[i + 1]):
                    win_array[i].get_united_window(win_array[i + 1])
                    win_array.remove(win_array[i + 1])
                else:
                    i += 1

        return dictionary

    def get_context_gen(self, query, win_size, limit, offset):
        '''
        This function uses a generator of context windows
        to produce a dictionary with a window generator;
        the windows are extended and reunited
        @param query: string query
        @param win_size: size of a window
        @param limit: number of documents to return
        @param offset: from which document to start
        @return: dictionary with a generator of extended and reunited windows
        '''

        if not isinstance(query, str) or not isinstance(win_size, int):
            raise TypeError('Input has an inappropriate type! %s, %s' %
                            (query, win_size))
        if not isinstance(limit, int) or not isinstance(offset, int):
            raise TypeError('Input has an inappropriate type!')

        # get a dictionary mapping filename to a position generator
        position_gen_dict = self.get_dict_many_tokens_limit_offset_generator(
            query, limit, offset)
        # build a dictionary mapping filename to a window generator
        window_gen_dict = dict()
        for filename in position_gen_dict:
            window_gen_dict[filename] = self.context_generator(
                filename, position_gen_dict[filename], win_size)
        # build a dictionary mapping filename to a generator of extended
        # and united windows
        context_dict = dict()
        for filename in window_gen_dict:
            context_dict[filename] = self.context_gen_uniter(
                window_gen_dict[filename])
        # this is the result of the function
        return context_dict

    def get_sentence_gen(self, query, win_size, limit, offset):
        '''
        This function uses a generator of sentences
        to produce a dictionary with a sentence generator;
        the sentences are extended and reunited
        @param query: string query
        @param win_size: size of a window
        @param limit: number of documents to return
        @param offset: from which document to start
        @return: dictionary with a generator of extended and reunited sentences
        '''
        if not isinstance(query, str) or not isinstance(win_size, int):
            raise TypeError('Input has an inappropriate type! %s, %s' %
                            (query, win_size))
        if not isinstance(limit, int) or not isinstance(offset, int):
            raise TypeError('Input has an inappropriate type!')

        # get a dictionary mapping filename to a generator of extended
        # and united windows; do not consume these generators here, or
        # they will be exhausted before the caller can use them
        context_dict = self.get_context_gen(query, win_size, limit, offset)
        # build a dictionary mapping filename to a sentence generator
        sentence_dict = dict()
        for filename in context_dict:
            sentence_dict[filename] = self.sentence_generator(
                context_dict[filename])
        # build the resulting dictionary with extended
        # and united sentences; merely creating a generator runs none of
        # its body, so no StopIteration handling is needed here
        final_sentence_dict = dict()
        for filename in sentence_dict:
            final_sentence_dict[filename] = self.sentence_generator_uniter(
                sentence_dict[filename])
        # this is the result of the function
        return final_sentence_dict

    def query_search(self, query, win_size):
        '''
        This function searches the database for a query and returns
        a dictionary filename:query in string format
        @param query: query to search
        @param win_size: a size of a context window
        @return: dictionary {filename: [query(str)]}
        '''
        if not isinstance(query, str) or not isinstance(win_size, int):
            raise TypeError('Input has an inappropriate type! %s, %s' %
                            (query, win_size))

        output_dict = {}
        # pass the caller's win_size through instead of hard-coding 1
        dictionary = self.unite_extended(query, win_size)
        for key, value in dictionary.items():
            for window in value:
                string = window.highlight_window()
                output_dict.setdefault(key, []).append(string)
        return output_dict

    def query_search_modified(self, query, win_size=1, limit=3, offset=0):
        '''
        This function searches the database for a query and returns
        a dictionary filename:query in string format
        It uses the new search function unite_extended_limit_offset() and
        therefore takes limit and offset as its arguments
        @param query: query to search
        @param win_size: a size of a context window
        @param limit: number of documents to return
        @param offset: from which document to start
        @return: dictionary {filename: [query(str)]}
        '''
        if not isinstance(query, str) or not isinstance(win_size, int):
            raise TypeError('Input has an inappropriate type! %s, %s' %
                            (query, win_size))

        output_dict = {}
        dictionary = self.unite_extended_limit_offset(query, win_size, limit,
                                                      offset)
        for key, value in dictionary.items():
            for window in value:
                string = window.highlight_window()
                output_dict.setdefault(key, []).append(string)
        return output_dict

    def qulim_search(self, query, win_size, limit, offset, doc_limof):
        '''
        This function searches the database for a query and returns
        a dictionary filename:query in string format
        @param query: query to search
        @param win_size: a size of a context window
        @param limit: max number of documents to show
        @param offset: document number to start with
        @param doc_limof: list of pairs giving the limit and offset for each
        concrete document; no more quotes are shown than this document limit
        @return: dictionary {filename: [query(str)]}
        '''

        if not isinstance(query, str) or not isinstance(
                limit, int) or not isinstance(offset, int):
            raise TypeError('Input has an inappropriate type! %s, %s, %s' %
                            (query, limit, offset))

        # dictionary for results
        output_dict = dict()
        # number of the document
        qunum = 0
        dictionary = self.unite_extended(query, win_size)
        for number, filename in enumerate(sorted(dictionary)):
            if number == limit + offset:
                break
            if number >= offset and number < limit + offset:
                # create a list for each file
                output_dict.setdefault(filename, [])
                # get all the quotes in the file
                all_quotes = dictionary[filename]
                # limit for this document
                qulim = doc_limof[qunum][0]
                # offset for this document
                quset = doc_limof[qunum][1]
                for num, quote in enumerate(all_quotes):
                    if num == qulim + quset:
                        break
                    if num >= quset and num < qulim + quset:
                        output_dict[filename].append(quote.highlight_window())
                qunum += 1
        return output_dict

    def qulim_search_modified(self,
                              query,
                              win_size=1,
                              limit=3,
                              offset=0,
                              doc_limof=[(3, 0), (3, 0), (3, 0)]):
        '''
        This function searches the database for a query and returns
        a dictionary filename:query in string format
        @param query: query to search
        @param win_size: a size of a context window
        @param limit: max number of documents to show
        @param offset: document number to start with
        @param doc_limof: list of pairs giving the limit and offset for each
        concrete document; no more quotes are shown than this document limit
        @return: dictionary {filename: [query(str)]}
        '''

        if not isinstance(query, str) or not isinstance(
                limit, int) or not isinstance(offset, int):
            raise TypeError('Input has an inappropriate type! %s, %s, %s' %
                            (query, limit, offset))

        # dictionary for results
        output_dict = dict()
        # number of the document
        qunum = 0
        dictionary = self.unite_extended_limit_offset(query, win_size, limit,
                                                      offset)
        # no need to compare against the document limits and offsets here:
        # the new function above already honours them,
        # so simply walk the files of the sorted dictionary
        for filename in sorted(dictionary):
            # create a list for each file
            output_dict.setdefault(filename, [])
            # take all the quotes for the given file from the dictionary
            all_quotes = dictionary[filename]
            # limit for this document
            qulim = doc_limof[qunum][0]
            if not qulim:
                qulim = 3
            # offset for this document
            quset = doc_limof[qunum][1]
            if not quset:
                quset = 0
            for num, quote in enumerate(all_quotes):
                if num == qulim + quset:
                    break
                if num >= quset and num < qulim + quset:
                    output_dict[filename].append(quote.highlight_window())
            qunum += 1
        return output_dict

    def qulim_search_modified_gen(self,
                                  query,
                                  win_size=1,
                                  limit=3,
                                  offset=0,
                                  doc_limof=[(3, 0), (3, 0), (3, 0)]):
        """
        This function performs searching a query in database and returs
        a dictionary filemname:query in string format
        It uses generators to work faster than the previous function
        named qulim_search_modified
        @param query: query to search
        @param win_size: a size of a context window
        @param limit: max number of documents to show
        @param offset: document number to start with
        @param doc_limof: list of pairs that show limit and offset of each concrete document,
        no more quotes can be shown than this doclimit
        @return: dictionary {filename: [query(str)]} 
        """

        if not isinstance(query, str) or not isinstance(
                limit, int) or not isinstance(offset, int):
            raise TypeError('Input has an unappropriate type! %s, %s, %s' %
                            (query, limit, offset))

        # dictionary for results
        output_dict = dict()
        # number of document
        qunum = 0
        # using brand new function with generator))
        dictionary = self.get_sentence_gen(query, win_size, limit, offset)
        for filename in sorted(dictionary):
            qulim = doc_limof[qunum][0]
            if not qulim:
                qulim = 3
            # print(qulim, 'qulim')
            # offset for document
            quset = doc_limof[qunum][1]
            if not quset:
                quset = 0
            # print(quset,'quset')
            output_dict.setdefault(filename, [])
            for item in range(quset):
                next(dictionary[item])
            for item in range(qulim):
                try:
                    output_dict[filename].append(
                        next(dictionary[item]).highlight_window())

                except StopIteration:
                    break

            qunum += 1
        return output_dict

    def position_generator(self, lists):
        '''
        This function generates positions
        @param lists: list of lists of positions
        It repeatedly takes the minimum of the head elements of the given
        lists and yields it (a k-way merge)
        '''
        # turn the lists into iterators
        iters = [iter(x) for x in lists]
        # the current head element of every list
        firsts = [next(it) for it in iters]
        while firsts:
            position_iter = min(firsts)
            yield position_iter
            # the index of the list this element came from
            position_iter_pos = firsts.index(position_iter)
            try:
                # advance to the next element in that list
                firsts[position_iter_pos] = next(iters[position_iter_pos])
            except StopIteration:
                # when a list is exhausted, its head and its iterator
                # are no longer needed, so drop both
                iters.pop(position_iter_pos)
                firsts.pop(position_iter_pos)

    def context_generator(self, filename, position_generator, win_size):
        """
        This function creates context windows from a given
        file using the position generator
        @param filename: a name of a file
        @param position_generator: generator which generates positions
        @param win_size: a size of a future context window
        @return: contexts windows, i.e. objects of Context_Window class
        """
        if not isinstance(filename, str) or not isinstance(win_size, int):
            raise TypeError('Input has an inappropriate type!')
        for pos in position_generator:
            window = Context_Window.get_window(filename, pos, win_size)
            yield window

    def context_gen_uniter(self, context_generator):
        """
        This function checks if generated windows intersect and unites them
        @param context_generator: generator of context windows
        @return: united context windows
        """
        # keep one window of lookbehind while walking the generator;
        # pulling from a single iterator (instead of from both the
        # generator and its iterator at once) ensures no window is skipped
        iterator = iter(context_generator)
        try:
            previous = next(iterator)
        except StopIteration:
            # the generator produced no windows at all
            return
        for next_window in iterator:
            # check for intersection and unite the windows if they cross;
            # otherwise emit the finished window and move on
            if previous.is_crossed(next_window):
                previous.get_united_window(next_window)
            else:
                yield previous
                previous = next_window
        yield previous

    def sentence_generator(self, context_gen_uniter):
        """
       This function generates sentences using the context_gen_uniter
       @param context_gen_uniter: generator of united context windows
       @return: extended windows
       """
        for window in context_gen_uniter:
            window.extend_window()
            yield window

    def sentence_generator_uniter(self, sentence_generator):
        """
       This function checks if generated sentences intersect and unites them
       @param sentence_generator: generator of sentences
       @return: extended windows after thier union
       """
        iterator = sentence_generator.__iter__()
        previous = iterator.__next__()
        for extended_window in sentence_generator:
            print(extended_window, 'extended_window')
            try:
                # если на вход было всего одно предложение, то в этом месте будет StopIteration?
                next_window = iterator.__next__()
                print(next_window, 'next_window')
                if previous.is_crossed(next_window):
                    previous.get_united_window(next_window)
                    print(previous, 'i united windows!')
                else:
                    yield previous
                    print(previous,
                          'I could not unite so this is just a window')
                    previous = next_window
            except StopIteration:
                break
                print('I stop the iteration')
        yield previous
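
position_generator above is a hand-rolled k-way merge: it repeatedly yields the smallest head among the remaining sorted lists. Since the per-file position lists are kept sorted, the standard library offers the same behaviour; an equivalent sketch:

import heapq

def position_generator(lists):
    # Equivalent k-way merge via the standard library: heapq.merge
    # lazily yields the globally smallest remaining element, assuming
    # every input list is already sorted.
    return heapq.merge(*lists)
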
Example #10
class Context_Window(object):
    """
    class Context_Window 
    """
    tokenizator = Tokenizator()

    def __init__(self, string, positions, win_start, win_end):
        """
        Constructor of a context window
        @param positions: positions of tokens
        @param string: string representation of a token
        @param win_start: position where window starts
        @param win_end: position where window ends
        """

        self.string = string
        self.positions = positions
        self.win_start = win_start
        self.win_end = win_end

    def __eq__(self, window):

        return (self.string == window.string
                and self.positions == window.positions
                and self.win_start == window.win_start
                and self.win_end == window.win_end)

    def __repr__(self):

        return str(self.string) + ' ' + str(self.positions) + ' ' + str(
            self.win_start) + ' ' + str(self.win_end)

    @classmethod
    def get_window(cls, filename, position, win_size):
        """
        This function returns a context window of a given token's position
        @param filename: a name of a file where token is to be found
        @param position: a position of a token
        @param win_size: desirable size of the context window
        @return: a context window
        """
        if not isinstance(filename, str) or not isinstance(win_size, int):
            raise TypeError('Input has an inappropriate type!')
        positions = []
        positions.append(position)
        win_end = 0
        win_start = 0
        string = None
        str_num = position.lnumber
        my_file = open(filename)
        for lnumber, my_string in enumerate(my_file):
            if lnumber == str_num:
                string = my_string
                break

        if string is None:
            my_file.close()
            raise TypeError('This string was not found!')

        for tok_num, token in enumerate(
                cls.tokenizator.token_gen(string[position.start:])):
            if tok_num == 0:
                win_end = position.end
            if tok_num == win_size:
                win_end = token.position + len(token.s) + position.start
                break

        for tok_num, token in enumerate(
                cls.tokenizator.token_gen(string[:position.end][::-1])):
            if tok_num == win_size:
                win_start = position.end - token.position - len(token.s)
                break

        my_file.close()
        return cls(string, positions, win_start, win_end)

    def is_crossed(self, window_B):
        '''
        This function checks if windows are crossed
        @param window_B: the second window
        @return: True or False
        '''
        if not isinstance(window_B, Context_Window):
            raise TypeError('Input has an inappropriate type!')
        # windows cross when their spans overlap on the same line;
        # identical borders are a special case of the overlap test
        return (self.win_start <= window_B.win_end
                and self.win_end >= window_B.win_start
                and self.positions[0].lnumber == window_B.positions[0].lnumber)

    def get_united_window(self, window_B):
        '''
        This function unites two windows
        @param window_B: the second window
        It changes self so that it has the new positions and returns nothing
        '''

        if not isinstance(window_B, Context_Window):
            raise TypeError('Input has an inappropriate type!')
        # take over the positions of the other window and widen the borders
        self.positions.extend(window_B.positions)
        self.win_start = min(window_B.win_start, self.win_start)
        self.win_end = max(window_B.win_end, self.win_end)

    def extend_window(self):
        '''
        This function extends a given window to sentence
        @return: an extended window
        '''
        to_right = self.string[self.win_end:]
        to_left = self.string[:self.win_start + 1][::-1]
        left = PATTERN_LEFT.search(to_left)
        right = PATTERN_RIGHT.search(to_right)
        if left is None:
            self.win_start = 0
        else:
            self.win_start -= left.start()
        if right is None:
            self.win_end = len(self.string)
        else:
            self.win_end += right.start() + 1

    def highlight_window(self):
        '''
        This function takes the substring of the window string that
        corresponds to the window size and highlights it
        '''
        win_string = self.string[self.win_start:self.win_end]
        fin = '</b>'
        st = '<b>'
        # walk the positions right to left so that inserting the tags
        # does not shift the offsets of the positions yet to be processed
        for position in reversed(self.positions):
            end = position.end - self.win_start
            begin = position.start - self.win_start
            win_string_one = win_string[:end] + fin + win_string[end:]
            win_string = win_string_one[:begin] + st + win_string_one[begin:]
        return win_string
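
highlight_window walks the positions right to left on purpose: inserting <b>...</b> tags lengthens the string, and going backwards keeps the offsets of the not-yet-processed positions valid. The technique in isolation, with hypothetical (start, end) pairs:

# Standalone sketch of the right-to-left highlighting used above.
win_string = 'mama washes the frame'
positions = [(0, 4), (16, 21)]  # hypothetical (start, end) pairs
for begin, end in reversed(positions):
    win_string = (win_string[:begin] + '<b>' + win_string[begin:end] +
                  '</b>' + win_string[end:])
print(win_string)  # <b>mama</b> washes the <b>frame</b>
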
Example #11
class TestMyCode(unittest.TestCase):
    # create a Tokenizator instance for each test
    def setUp(self):
        self.x = Tokenizator()

    # the test itself
    def test_isgenerator(self):
        result = self.x.tokens_generator(' Ф 12 !!! @ # Alina is a student)))')
        self.assertIsInstance(result, Generator)

    def test_begins_with_no_alpha(self):
        result = list(
            self.x.tokens_generator(' ф 12 !!! @ # Alina is a student)))'))
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].s, 'ф')
        self.assertEqual(result[0].position, 1)
        self.assertEqual(result[1].s, 'Alina')
        self.assertEqual(result[1].position, 14)

    def test_begins_with_alpha(self):
        result = list(
            self.x.tokens_generator('ф 12 !!! @ # Alina is a student)))'))
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].s, 'ф')
        self.assertEqual(result[0].position, 0)
        self.assertEqual(result[1].s, 'Alina')
        self.assertEqual(result[1].position, 13)

    def test_ends_with_alpha(self):
        result = list(
            self.x.tokens_generator('ф 12 !!! @ # Alina is a student)))abc'))
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 6)
        self.assertEqual(result[0].s, 'ф')
        self.assertEqual(result[0].position, 0)
        self.assertEqual(result[5].s, 'abc')
        self.assertEqual(result[5].position, 34)

    def test_ends_with_no_alpha(self):
        result = list(
            self.x.tokens_generator('ф 12 !!! @ # Alina is a student)))'))
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].s, 'ф')
        self.assertEqual(result[0].position, 0)
        self.assertEqual(result[4].s, 'student')
        self.assertEqual(result[4].position, 24)

    def test_MyError_number(self):
        with self.assertRaises(ValueError):
            list(self.x.tokens_generator(12))

    def test_MyError_notList(self):
        s = [1, 2, 3, 'this is my string']
        with self.assertRaises(ValueError):
            list(self.x.tokens_generator(s))

    def test_Function_begins_with_no_alpha(self):
        result = self.x.tokenize(' ф 12 !!! @ # Alina is a student)))')
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].s, 'ф')
        self.assertEqual(result[0].position, 1)
        self.assertEqual(result[1].s, 'Alina')
        self.assertEqual(result[1].position, 14)

    def test_function_begins_with_alpha(self):
        result = self.x.tokenize('ф 12 !!! @ # Alina is a student)))')
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].s, 'ф')
        self.assertEqual(result[0].position, 0)
        self.assertEqual(result[1].s, 'Alina')
        self.assertEqual(result[1].position, 13)

    def test_function_ends_with_alpha(self):
        result = self.x.tokenize('ф 12 !!! @ # Alina is a student)))abc')
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 6)
        self.assertEqual(result[0].s, 'ф')
        self.assertEqual(result[0].position, 0)
        self.assertEqual(result[5].s, 'abc')
        self.assertEqual(result[5].position, 34)

    def test_function_ends_with_no_alpha(self):
        result = self.x.tokenize('ф 12 !!! @ # Alina is a student)))')
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].s, 'ф')
        self.assertEqual(result[0].position, 0)
        self.assertEqual(result[4].s, 'student')
        self.assertEqual(result[4].position, 24)

    def test_MyError_function_number(self):
        with self.assertRaises(ValueError):
            self.x.tokenize(12)

    def test_MyError_function_notList(self):
        s = [1, 2, 3, 'this is my string']
        with self.assertRaises(ValueError):
            self.x.tokenize(s)

    def test_isgenerator_for_token_gen(self):
        result = self.x.token_gen(' Ф 12 !!! @ # Alina is a student)))')
        self.assertIsInstance(result, Generator)

    def test_my_token_gen(self):
        result = list(self.x.token_gen(' Ф 12 !!! @ # Alina is a student)))'))
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 6)
        self.assertEqual(result[0].s, 'Ф')
        self.assertEqual(result[0].tp, 'alpha')
        self.assertEqual(result[0].position, 1)
        self.assertEqual(result[1].s, '12')
        self.assertEqual(result[1].tp, 'digit')
        self.assertEqual(result[1].position, 3)
        self.assertEqual(result[5].s, 'student')
        self.assertEqual(result[5].tp, 'alpha')
        self.assertEqual(result[5].position, 25)

    def test_MyError_token_gen_number(self):
        with self.assertRaises(ValueError):
            list(self.x.token_gen(12))

    def test_MyError_token_gen_notList(self):
        s = [1, 2, 3, 'this is my string']
        with self.assertRaises(ValueError):
            list(self.x.token_gen(s))

    def test_empty_string(self):
        result = list(self.x.tokens_generator(''))
        self.assertEqual(len(result), 0)
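
To execute any of the test cases above directly, the usual unittest entry point (not shown in the original snippets) is enough:

if __name__ == '__main__':
    unittest.main()
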