def parse_buffer(self, undecoded_text_buffer, encoding=""):
    """Tokenize a raw (undecoded) text buffer and normalize each token.

    Parameters:
        undecoded_text_buffer: raw byte buffer handed to the tokenizer.
        encoding: encoding name forwarded to ``span_tokenize`` ("" default).

    Returns:
        The tokens produced by ``span_tokenize``, with each ``token.token``
        replaced in place by its unified (normalized) form.
    """
    # Consolidate the two scattered `from ling_utils import ...` statements
    # into a single import at the top of the function.
    from ling_utils import span_tokenize, unify_word

    tokens = span_tokenize(undecoded_text_buffer, encoding=encoding)
    for token in tokens:
        token.token = unify_word(token.token)
    return tokens
def flush_buffer():
    """Flush the in-memory word index to the reducer workers, then reset it.

    Every buffered raw token is decoded from windows-1251, normalized, routed
    to a reducer chosen by a hash of the unified form, and written as
    "<token>\\t<code code ...>\\n".  The shared buffer dict and its counter
    are cleared afterwards.
    """
    log_out.write("flushing buffer..\n")
    log_out.flush()
    n_reducers = len(reducers_pool)
    for raw_token, codes in words_index[0].items():
        unified = unify_word(raw_token.decode("windows-1251"))
        # Same unified token always maps to the same reducer.
        target = reducers_pool[abs(hash(unified)) % n_reducers]
        payload = " ".join(str(code) for code in codes)
        target.write(unified + "\t" + payload + "\n")
    words_index[0] = {}
    words_in_buffer[0] = 0
def find_title(self, title_query):
    """Find objects whose title contains every token of *title_query*.

    Intersects the posting lists in ``self.title_index`` for all query
    tokens (tokenized as windows-1251, then normalized).

    Returns:
        A set of matching object ids; an empty list when any token is
        unknown or the intersection is empty (mixed return type kept for
        backward compatibility with existing callers).
    """
    matched_objects = None
    for match in span_tokenize_windows1251(title_query):
        token = unify_word(match[-1].decode("windows-1251"))
        # Every query token must be indexed; a single miss means no result.
        if token not in self.title_index:
            return []
        if matched_objects is None:
            matched_objects = set(self.title_index[token])
        else:
            matched_objects &= set(self.title_index[token])
    if not matched_objects:
        return []
    return matched_objects
def find_mentions_of_author_and_title(self, query):
    """Score books by how many query tokens hit their title or author index.

    The query is encoded to windows-1251, tokenized (only the first 10
    token matches are used), and each distinct normalized token adds one
    point per index (title and author) whose postings contain the book.
    Books matching at least 60% of the distinct tokens are returned,
    highest score first.
    """
    # Hoisted from the middle of the function body; kept function-local
    # to match the file's local-import style.
    import math

    tokens = set(
        unify_word(match[-1].decode("windows-1251"))
        for match in span_tokenize_windows1251(query.encode("windows-1251"))[:10]
    )
    books_scores = {}
    for token in tokens:
        if token in self.title_index:
            # set() dedups repeated postings so a book scores once per token.
            for obj_id in set(self.title_index[token]):
                books_scores[obj_id] = books_scores.get(obj_id, 0) + 1
        if token in self.author_index:
            for obj_id in set(self.author_index[token]):
                books_scores[obj_id] = books_scores.get(obj_id, 0) + 1
    # Require at least 60% of the distinct query tokens to match.
    min_match = math.ceil(len(tokens) * 0.6)
    matched_books = [
        (score, book)
        for book, score in books_scores.items()
        if score >= min_match
    ]
    matched_books.sort(reverse=True)
    return [book for _, book in matched_books]
def add_title(self, title, object_id):
    """Index every normalized token of *title* under *object_id*."""
    for match in span_tokenize_windows1251(title):
        word = unify_word(match[-1].decode("windows-1251"))
        postings = self.title_index.setdefault(word, [])
        postings.append(object_id)
def get_surname(author_str_windows1251):
    """Guess the surname in a windows-1251 encoded author string.

    Heuristic: the surname is assumed to be the longest normalized word,
    with ties broken lexicographically.  Returns "" for empty input.
    """
    words = []
    for match in span_tokenize_windows1251(author_str_windows1251):
        words.append(unify_word(match[-1].decode("windows-1251")))
    if not words:
        return ""
    # Equivalent to max over (len, word) tuples, keeping the word itself.
    return max(words, key=lambda w: (len(w), w))