def create_db_index(self, db_name, file_name):
    """
    The method is similar to 'create_complex_index_from_file', but it writes
    the computed index to a shelve database rather than to a dict with a
    complex structure.
    """
    if (not isinstance(db_name, str)) or (not isinstance(file_name, str)):
        raise ValueError("This method takes two strings as input: "
                         "the names (paths) of a database and a file.")
    with shelve.open(db_name, writeback=True) as db:
        tokenizer = t.Tokenizer()
        # the same logic as in the method create_complex_index_from_file
        with open(file_name, encoding="utf-8") as file:
            for i, string in enumerate(file):
                # for each token yielded by the generator
                for token in tokenizer.iter_tokenize(string):
                    if token.kind == "alpha" or token.kind == "digit":
                        # internal_dict stores the positions within one file
                        internal_dict = db.setdefault(token.word, {})
                        positions_list = internal_dict.setdefault(
                            file.name, [])
                        positions_list.append(
                            File_Position(token.start, token.end, i + 1))
def test_13(self):
    """
    Tests how the generator tokenizer works for a regular string
    with all types of tokens.
    """
    t = my_tokenizer_combined.Tokenizer()
    result = list(t.iter_tokenize("a string: 12,$,3"))
    self.assertEqual(len(result), 10)
    self.assertEqual(result[0].word, "a")
    self.assertEqual(result[0].kind, "alpha")
    self.assertEqual(result[1].word, " ")
    self.assertEqual(result[1].kind, "space")
    self.assertEqual(result[1].length, 1)
    self.assertEqual(result[3].word, ":")
    self.assertEqual(result[3].kind, "punct")
    self.assertEqual(result[3].length, 1)
    self.assertEqual(result[5].word, "12")
    self.assertEqual(result[5].kind, "digit")
    self.assertEqual(result[5].length, 2)
    self.assertEqual(result[7].word, "$")
    self.assertEqual(result[7].kind, "other")
    self.assertEqual(result[7].length, 1)
    self.assertIsInstance(result[0], my_tokenizer_combined.Advanced_Token)
    self.assertIsInstance(result[1], my_tokenizer_combined.Advanced_Token)
    self.assertIsInstance(result[3], my_tokenizer_combined.Advanced_Token)
    self.assertIsInstance(result[5], my_tokenizer_combined.Advanced_Token)
    self.assertIsInstance(result[7], my_tokenizer_combined.Advanced_Token)
def create_complex_index_from_file(self, path):
    """
    The method takes the name (a path) of a file as input and creates a dict
    with a token as a key and, as a value, another dict that maps the file
    name to a list of the token's 'file' positions.
    """
    if not isinstance(path, str):
        raise ValueError("This method takes a string containing "
                         "the name (a path) of a file as input.")
    our_dict = {}
    tokenizer = t.Tokenizer()
    with open(path, encoding="utf-8") as file:
        for i, string in enumerate(file):
            # for each token yielded by the generator
            for token in tokenizer.iter_tokenize(string):
                if token.kind == "alpha" or token.kind == "digit":
                    # internal_dict stores the positions within one file
                    internal_dict = our_dict.setdefault(token.word, {})
                    positions_list = internal_dict.setdefault(
                        file.name, [])
                    positions_list.append(
                        File_Position(token.start, token.end, i + 1))
    return our_dict
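# An illustrative sketch of the structure this method builds (the instance
# name `indexer`, the file name "sample.txt" and its contents are hypothetical):
#
#     index = indexer.create_complex_index_from_file("sample.txt")
#     # index might look like:
#     # {
#     #     "word": {"sample.txt": [File_Position(1, 4, 1),
#     #                             File_Position(7, 10, 2)]},
#     #     "12":   {"sample.txt": [File_Position(12, 13, 2)]},
#     # }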
def create_long_index_from_file(self, path):
    """
    The method takes the name (a path) of a file as input and creates a dict
    with a token as a key and a list of its 'long' positions as a value.
    """
    if not isinstance(path, str):
        raise ValueError("This method takes a string containing "
                         "the name (a path) of a file as input.")
    our_dict = {}
    with open(path, encoding="utf-8") as file:
        tokenizer = t.Tokenizer()
        for i, string in enumerate(file):
            # for each token yielded by the generator
            for obj in tokenizer.iter_tokenize(string):
                if obj.kind == "alpha" or obj.kind == "digit":
                    positions = our_dict.setdefault(obj.word, [])
                    positions.append(
                        Long_Position(obj.start, obj.end, i + 1, file.name))
    return our_dict
def test_12(self):
    """
    For a non-string (integer) object as an input.
    """
    t = my_tokenizer_combined.Tokenizer()
    with self.assertRaises(ValueError):
        t.advanced_tokenize(5)
def get_context(self, file, file_pos, length):
    """
    Returns a Context_Window instance for a word, given its position in a file.
    Takes the following arguments:
    file - the file name
    file_pos - a position of a word in a file
    length - the length of a context window
    """
    if not isinstance(file, str):
        raise ValueError(
            "The first argument should be a string with a file name.")
    if not isinstance(file_pos, my_indexer_combined.File_Position):
        raise ValueError(
            "The second argument should be a File_Position instance.")
    if not isinstance(length, int):
        raise ValueError("The third argument should be an integer number.")
    tokenizer = my_tokenizer_combined.Tokenizer()
    with open(file, encoding="utf-8") as f:
        # tokenize a sub-string to the right and an inverted sub-string to the left
        for i, string in enumerate(f):
            # we count lines to find the string that contains the word
            if file_pos.string_numb == i + 1:
                right_string = string
                # string[start:end:-1] returns an inverted string
                j = 0  # j is just a counter
                start = 1  # in case the 'length' of the context goes out of the string's range
                if file_pos.start != 1:
                    # otherwise start = 1 and we don't need to alter it
                    for token in tokenizer.iter_tokenize(
                            right_string[file_pos.start - 2::-1]):
                        if j < length:
                            if token.kind == "alpha" or token.kind == "digit":
                                j += 1
                            continue
                        # because we go through an inverted string
                        start = file_pos.start - token.end + 1
                        break
                n = 0
                # in case the 'length' of the context goes out of the string's range
                end = len(right_string) - 1  # the last symbol is '\n' - we don't need it
                for token in tokenizer.iter_tokenize(right_string[file_pos.end:]):
                    if n < length:
                        if token.kind == "alpha" or token.kind == "digit":
                            n += 1
                        continue
                    # because 'token.end' is computed relative to the substring
                    end = file_pos.end + token.end - 1
                    break
                window = Context_Window([file_pos], start, end, right_string[:-1])
                return window
def test_11(self):
    """
    For an empty string.
    """
    t = my_tokenizer_combined.Tokenizer()
    result = t.advanced_tokenize("")
    self.assertEqual(len(result), 0)
    self.assertEqual(result, [])
def the_second_simplest(self, query):
    """
    A method that returns a dict with positions of tokens from the query in
    files. It takes a string as an input, tokenizes it, and returns a dict
    with the names of files (those that contain all tokens of the input) as
    keys and lists of the tokens' positions in those files as values.
    """
    if not isinstance(query, str):
        raise ValueError("An argument should be a string.")
    dict_for_all = {}
    t = my_tokenizer_combined.Tokenizer()
    tokens = t.iter_tokenize(query.strip())  # strip leading and trailing spaces
    # intersection of keys - files containing all tokens of the input;
    # here we collect them and write them to the variable set_for_all
    for i, token in enumerate(tokens):
        if token.kind == "alpha" or token.kind == "digit":
            if i == 0:
                set_for_all = set(self.the_simplest(token.word))
            set_for_all &= set(self.the_simplest(token.word))
    # here we create the final dict
    tokens = t.iter_tokenize(query)
    for token in tokens:
        if token.kind == "alpha" or token.kind == "digit":
            for file in set_for_all:
                conjunction = dict_for_all.setdefault(file, [])
                # the_simplest method returns a dict with file names as keys
                conjunction += self.the_simplest(token.word)[file]
    # for each key we sort its list of values (i.e. the list of positions)
    for key in dict_for_all:
        dict_for_all[key].sort()
    ### NEW ###
    # here we go through all positions (i.e. the values for each key, where
    # the keys are file names) and delete duplicates
    for key in dict_for_all:
        new_positions = dict_for_all[key]  # they are sorted
        i = 0
        while i < len(new_positions) - 1:
            if new_positions[i] == new_positions[i + 1]:
                new_positions.pop(i)
            else:
                i += 1
        dict_for_all[key] = new_positions
    return dict_for_all
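# An illustrative sketch of the intersection search (the instance name `engine`,
# the file names and the positions are hypothetical; assume "file_1.txt"
# contains both query words and "file_2.txt" contains only "cat"):
#
#     result = engine.the_second_simplest("cat dog")
#     # only files containing ALL query tokens are kept, e.g.:
#     # {"file_1.txt": [File_Position(1, 3, 1),    # "cat" on line 1
#     #                 File_Position(5, 7, 1)]}   # "dog" on line 1
#     # "file_2.txt" is dropped because it lacks "dog"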
def test_6(self):
    """
    For only one punctuation mark as an input string.
    """
    t = my_tokenizer_combined.Tokenizer()
    result = t.advanced_tokenize("!")
    self.assertEqual(len(result), 1)
    self.assertEqual(result[0].word, "!")
    self.assertEqual(result[0].start, 1)
    self.assertEqual(result[0].end, 1)
    self.assertEqual(result[0].length, 1)
    self.assertEqual(result[0].kind, "punct")
    self.assertIsInstance(result[0], my_tokenizer_combined.Advanced_Token)
def test_7(self):
    """
    For only one alpha-token as an input string.
    """
    t = my_tokenizer_combined.Tokenizer()
    result = t.advanced_tokenize("слово")
    self.assertEqual(len(result), 1)
    self.assertEqual(result[0].word, "слово")
    self.assertEqual(result[0].start, 1)
    self.assertEqual(result[0].end, 5)
    self.assertEqual(result[0].length, 5)
    self.assertEqual(result[0].kind, "alpha")
    self.assertIsInstance(result[0], my_tokenizer_combined.Advanced_Token)
def test_9(self):
    """
    For only one digit-token as an input string.
    """
    t = my_tokenizer_combined.Tokenizer()
    result = t.advanced_tokenize("1")
    self.assertEqual(len(result), 1)
    self.assertEqual(result[0].word, "1")
    self.assertEqual(result[0].start, 1)
    self.assertEqual(result[0].end, 1)
    self.assertEqual(result[0].length, 1)
    self.assertEqual(result[0].kind, "digit")
    self.assertIsInstance(result[0], my_tokenizer_combined.Advanced_Token)
def test_10(self):
    """
    For only one "other" unicode symbol as an input string.
    """
    t = my_tokenizer_combined.Tokenizer()
    result = t.advanced_tokenize("$")
    self.assertEqual(len(result), 1)
    self.assertEqual(result[0].word, "$")
    self.assertEqual(result[0].start, 1)
    self.assertEqual(result[0].end, 1)
    self.assertEqual(result[0].length, 1)
    self.assertEqual(result[0].kind, "other")
    self.assertIsInstance(result[0], my_tokenizer_combined.Advanced_Token)
def the_second_simplest(self, query, doc_limit, doc_offset):  #### NEW ####
    """
    A method that returns a dict with a GENERATOR of positions of tokens from
    the query in files. It takes a string 'query' as an input, tokenizes it,
    and returns a dict with the names of files (those that contain all tokens
    of the input) as keys and GENERATORS made from the lists of positions in
    those files as values. Its arguments also include 'doc_limit' and
    'doc_offset', which indicate what part of the documents will be shown
    on the final page via our server.
    """
    if not isinstance(query, str):
        raise ValueError("An argument should be a string.")
    dict_for_all = {}
    t = my_tokenizer_combined.Tokenizer()
    tokens = t.iter_tokenize(query.strip())  # strip leading and trailing spaces
    # intersection of keys - files containing all tokens of the input;
    # here we collect them and write them to the variable set_for_all
    for i, token in enumerate(tokens):
        if token.kind == "alpha" or token.kind == "digit":
            if i == 0:
                set_for_all = set(self.the_simplest(token.word))
            set_for_all &= set(self.the_simplest(token.word))
    list_from_set = list(set_for_all)
    list_from_set.sort()
    # now we take only a part of list_from_set
    new_list = list_from_set[doc_offset:doc_offset + doc_limit]
    ### NEW ###
    # create a dict with file names as keys and lists of lists as values, where
    # each inner list contains the positions of one query word in this file
    for file in new_list:
        tokens = t.iter_tokenize(query)
        list_of_lists = []
        for token in tokens:
            current_list = []
            if token.kind == "alpha" or token.kind == "digit":
                current_list = self.the_simplest(token.word)[file]
            list_of_lists.append(current_list)
        # for abstract_iterator to deal with context windows we defined __lt__ for them
        dict_for_all[file] = self.abstract_iterator(list_of_lists)
        # here we delete equal positions, otherwise we get "князькнязь..."
        dict_for_all[file] = self.gen_delete_equal(dict_for_all[file])
    return dict_for_all
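# An illustrative sketch of the pagination step (the file names, limit and
# offset below are hypothetical):
#
#     # if the sorted matching files are ["a.txt", "b.txt", "c.txt", "d.txt"],
#     # then with doc_offset=1 and doc_limit=2 only ["b.txt", "c.txt"] are kept,
#     # and each of them is mapped to a generator of de-duplicated positions.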
def test_3(self):
    """
    For a string that starts and (=or) ends with a digit.
    """
    t = my_tokenizer_combined.Tokenizer()
    result = t.advanced_tokenize("1 some string with digits 5")
    self.assertEqual(len(result), 11)
    self.assertEqual(result[0].word, "1")
    self.assertEqual(result[0].length, 1)
    self.assertEqual(result[0].kind, "digit")
    self.assertEqual(result[2].word, "some")
    self.assertEqual(result[2].kind, "alpha")
    self.assertEqual(result[10].word, "5")
    self.assertEqual(result[10].length, 1)
    self.assertEqual(result[10].kind, "digit")
def test_2(self):
    """
    For a string that starts and (=or) ends with a space.
    """
    t = my_tokenizer_combined.Tokenizer()
    result = t.advanced_tokenize("  some string with spaces ")
    self.assertEqual(len(result), 9)
    self.assertEqual(result[0].word, "  ")
    self.assertEqual(result[0].length, 2)
    self.assertEqual(result[0].kind, "space")
    self.assertEqual(result[1].word, "some")
    self.assertEqual(result[1].kind, "alpha")
    self.assertEqual(result[8].word, " ")
    self.assertEqual(result[8].length, 1)
    self.assertEqual(result[8].kind, "space")
def test_5(self):
    """
    For a string that starts and (=or) ends with an "other" unicode symbol.
    """
    t = my_tokenizer_combined.Tokenizer()
    result = t.advanced_tokenize("$some string with \"other\" symols$")
    self.assertEqual(len(result), 13)
    self.assertEqual(result[0].word, "$")
    self.assertEqual(result[0].length, 1)
    self.assertEqual(result[0].kind, "other")
    self.assertEqual(result[1].word, "some")
    self.assertEqual(result[1].kind, "alpha")
    self.assertEqual(result[2].word, " ")
    self.assertEqual(result[2].length, 1)
    self.assertEqual(result[2].kind, "space")
    self.assertEqual(result[12].word, "$")
    self.assertEqual(result[12].length, 1)
    self.assertEqual(result[12].kind, "other")
def test_4(self):
    """
    For a string that starts and (=or) ends with a punctuation mark.
    """
    t = my_tokenizer_combined.Tokenizer()
    result = t.advanced_tokenize("_some string with punctuation_")
    self.assertEqual(len(result), 9)
    self.assertEqual(result[0].word, "_")
    self.assertEqual(result[0].length, 1)
    self.assertEqual(result[0].kind, "punct")
    self.assertEqual(result[1].word, "some")
    self.assertEqual(result[1].kind, "alpha")
    self.assertEqual(result[2].word, " ")
    self.assertEqual(result[2].length, 1)
    self.assertEqual(result[2].kind, "space")
    self.assertEqual(result[8].word, "_")
    self.assertEqual(result[8].length, 1)
    self.assertEqual(result[8].kind, "punct")
def create_index_from_string(self, string):
    """
    Takes a string as an argument and creates a dictionary, which contains
    words or digits as keys and a list of their 'basic' positions in the
    string as values.
    """
    if not isinstance(string, str):
        raise ValueError("This indexer works only with strings.")
    our_dict = {}
    for obj in t.Tokenizer().iter_tokenize(string):
        # we include only words or digits in the final dict
        if obj.kind == "alpha" or obj.kind == "digit":
            # setdefault returns dict[key], adding the key with the default
            # value if the key is not in the dict yet
            positions = our_dict.setdefault(obj.word, [])
            positions.append(Basic_Position(obj.start, obj.end))
    return our_dict
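# An illustrative sketch of the output (the instance name `indexer` and the
# input string are hypothetical; positions are 1-based and end-inclusive, as
# in the tokenizer tests):
#
#     index = indexer.create_index_from_string("cat 12 cat")
#     # index might look like:
#     # {"cat": [Basic_Position(1, 3), Basic_Position(8, 10)],
#     #  "12":  [Basic_Position(5, 6)]}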
def get_window(self, file_pos, right_string, length):
    """
    Returns a Context_Window instance, given a position of a word in a file,
    the corresponding string from the file and the length of the window.
    """
    # this part of the code is the same as in the get_context method
    tokenizer = my_tokenizer_combined.Tokenizer()
    # string[start:end:-1] returns an inverted string
    j = 0  # j is just a counter
    start = 1  # in case the 'length' of the context goes out of the string's range
    if file_pos.start != 1:
        # otherwise start = 1 and we don't need to alter it
        for token in tokenizer.iter_tokenize(
                right_string[file_pos.start - 2::-1]):
            if j < length:
                if token.kind == "alpha" or token.kind == "digit":
                    j += 1
                continue
            # because we go through an inverted string
            start = file_pos.start - token.end + 1
            break
    n = 0
    # in case the 'length' of the context goes out of the string's range
    end = len(right_string) - 1  # the last symbol is '\r' - we don't need it
    for token in tokenizer.iter_tokenize(right_string[file_pos.end:]):
        if n < length:
            if token.kind == "alpha" or token.kind == "digit":
                n += 1
            continue
        # because 'token.end' is computed relative to the substring
        end = file_pos.end + token.end - 1
        break
    # [:-1] - not to include the final '\r'
    return Context_Window([file_pos], start, end, right_string[:-1])
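# An illustrative sketch of the boundaries this method computes (the instance
# name `engine`, the string, the position and the window length are made up):
#
#     pos = my_indexer_combined.File_Position(17, 19, 1)  # "fox" on line 1
#     win = engine.get_window(pos, "the quick brown fox jumps high\n", 1)
#     # with length=1 the window extends one word to each side:
#     # the computed start is 11 (the first letter of "brown")
#     # and the computed end is 25 (the last letter of "jumps")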