def parse_buffer(self, undecoded_text_buffer, encoding=""):
    """Tokenize a raw (undecoded) text buffer and normalize each token.

    Parameters:
        undecoded_text_buffer: raw byte buffer handed to the tokenizer.
        encoding: encoding name forwarded to ``span_tokenize`` ("" default).

    Returns:
        The tokens produced by ``span_tokenize``, with each ``token.token``
        replaced in place by its unified (normalized) form.
    """
    # Consolidate the two scattered `from ling_utils import ...` statements
    # into a single import at the top of the function.
    from ling_utils import span_tokenize, unify_word

    tokens = span_tokenize(undecoded_text_buffer, encoding=encoding)
    for token in tokens:
        token.token = unify_word(token.token)
    return tokens
def flush_buffer():
    """Flush the in-memory word index to the reducer workers, then reset it.

    Every buffered raw token is decoded from windows-1251, normalized, routed
    to a reducer chosen by a hash of the unified form, and written as
    "<token>\\t<code code ...>\\n".  The shared buffer dict and its counter
    are cleared afterwards.
    """
    log_out.write("flushing buffer..\n")
    log_out.flush()
    n_reducers = len(reducers_pool)
    for raw_token, codes in words_index[0].items():
        unified = unify_word(raw_token.decode("windows-1251"))
        # Same unified token always maps to the same reducer.
        target = reducers_pool[abs(hash(unified)) % n_reducers]
        payload = " ".join(str(code) for code in codes)
        target.write(unified + "\t" + payload + "\n")
    words_index[0] = {}
    words_in_buffer[0] = 0
def find_title(self, title_query):
    """Find objects whose title contains every token of *title_query*.

    Intersects the posting lists in ``self.title_index`` for all query
    tokens (tokenized as windows-1251, then normalized).

    Returns:
        A set of matching object ids; an empty list when any token is
        unknown or the intersection is empty (mixed return type kept for
        backward compatibility with existing callers).
    """
    matched_objects = None
    for match in span_tokenize_windows1251(title_query):
        token = unify_word(match[-1].decode("windows-1251"))
        # Every query token must be indexed; a single miss means no result.
        if token not in self.title_index:
            return []
        if matched_objects is None:
            matched_objects = set(self.title_index[token])
        else:
            matched_objects &= set(self.title_index[token])
    if not matched_objects:
        return []
    return matched_objects
def find_mentions_of_author_and_title(self, query):
    """Score books by how many query tokens hit their title or author index.

    The query is encoded to windows-1251, tokenized (only the first 10
    token matches are used), and each distinct normalized token adds one
    point per index (title and author) whose postings contain the book.
    Books matching at least 60% of the distinct tokens are returned,
    highest score first.
    """
    # Hoisted from the middle of the function body; kept function-local
    # to match the file's local-import style.
    import math

    tokens = set(
        unify_word(match[-1].decode("windows-1251"))
        for match in span_tokenize_windows1251(query.encode("windows-1251"))[:10]
    )
    books_scores = {}
    for token in tokens:
        if token in self.title_index:
            # set() dedups repeated postings so a book scores once per token.
            for obj_id in set(self.title_index[token]):
                books_scores[obj_id] = books_scores.get(obj_id, 0) + 1
        if token in self.author_index:
            for obj_id in set(self.author_index[token]):
                books_scores[obj_id] = books_scores.get(obj_id, 0) + 1
    # Require at least 60% of the distinct query tokens to match.
    min_match = math.ceil(len(tokens) * 0.6)
    matched_books = [
        (score, book)
        for book, score in books_scores.items()
        if score >= min_match
    ]
    matched_books.sort(reverse=True)
    return [book for _, book in matched_books]
def add_title(self, title, object_id):
    """Index every normalized token of *title* under *object_id*."""
    for match in span_tokenize_windows1251(title):
        word = unify_word(match[-1].decode("windows-1251"))
        postings = self.title_index.setdefault(word, [])
        postings.append(object_id)
def get_surname(author_str_windows1251):
    """Guess the surname in a windows-1251 encoded author string.

    Heuristic: the surname is assumed to be the longest normalized word,
    with ties broken lexicographically.  Returns "" for empty input.
    """
    words = []
    for match in span_tokenize_windows1251(author_str_windows1251):
        words.append(unify_word(match[-1].decode("windows-1251")))
    if not words:
        return ""
    # Equivalent to max over (len, word) tuples, keeping the word itself.
    return max(words, key=lambda w: (len(w), w))