def postprocess_ml(split, token_counters, model):
    """Post-correct a token sequence with the split/merge classifier.

    Pass 1 greedily joins adjacent tokens whose concatenation is a known
    word and which *model* does not vote to keep apart.  Pass 2 then
    splits each surviving token at the first position where both halves
    are known words and the model votes for a split.

    Returns the corrected list of tokens.
    """
    # Pass 1: greedy left-to-right merging; a freshly merged token may
    # absorb further neighbours on later iterations.
    merged = []
    for tok in split:
        if merged:
            candidate = merged[-1] + tok
            if is_word(candidate, token_counters) \
                    and not predict_split(model, merged[-1], tok,
                                          token_counters):
                merged[-1] = candidate
                continue
        merged.append(tok)
    # Pass 2: split each token at the first model-approved cut, if any.
    result = []
    for tok in merged:
        for pos in range(1, len(tok)):
            head, tail = tok[:pos], tok[pos:]
            if is_word(head, token_counters) and is_word(tail, token_counters) \
                    and predict_split(model, head, tail, token_counters):
                result.extend((head, tail))
                break
        else:
            # No acceptable cut found: keep the token as-is.
            result.append(tok)
    return result
def combine_mergable_tokens_until_two_correct(tokens, token_counters):
    """Group space-separated tokens into runs of merge candidates.

    *tokens* is a sequence of pairs; element ``[0]`` appears to be a flag
    marking "real" word tokens (as opposed to separators) and element
    ``[1]`` the token text — TODO confirm against the caller.  Consecutive
    flagged tokens separated by a single space are collected into one
    merge group until two known words in a row are seen, at which point
    the group is cut off.

    Returns a list of ``(is_mergable, [texts...])`` pairs.
    """
    token_lists = []
    t_i = 0
    while t_i < len(tokens):
        current = tokens[t_i]
        if not current[0]:
            # Non-word token (separator/punctuation): emit it on its own.
            token_lists.append((False, [current[1]]))
        else:
            words_to_merge = [current[1]]
            #original_space_positions = []
            # Seed the run counter with the current token's word status.
            consecutive_correct = 1 if is_word(current[1], token_counters) else 0
            # Extend the group while the pattern "space + word token" holds.
            while t_i < len(tokens) - 2:
                if tokens[t_i + 1][1] == ' ' and tokens[t_i + 2][0]:
                    word = tokens[t_i + 2][1]
                    if is_word(word, token_counters):
                        consecutive_correct += 1
                    else:
                        consecutive_correct = 0
                    if consecutive_correct == 2:
                        # Two known words in a row: stop BEFORE absorbing
                        # the second one into the group.
                        break
                    #original_space_positions.append(len(word)
                    #                                + (0 if len(original_space_positions) == 0
                    #                                   else original_space_positions[-1]))
                    words_to_merge.append(word)
                    # Skip the space and the absorbed token.
                    t_i += 2
                else:
                    break
            token_lists.append((True, words_to_merge))
        t_i += 1
    return token_lists
def number_of_nonwords(tokens, word_counters, limit=None):
    """Count how many of *tokens* are not known words.

    When *limit* is given, counting stops as soon as the count exceeds
    it (the returned value is then ``limit + 1``) — an early exit for
    callers that only need a "worse than limit" verdict.
    """
    count = 0
    for tok in tokens:
        if is_word(tok, word_counters):
            continue
        count += 1
        if limit is not None and count > limit:
            break
    return count
def get_split_candidates(sequence, token_counters):
    """Recursively enumerate all splits of *sequence* at word boundaries.

    Every returned candidate is a list of substrings whose concatenation
    is *sequence*; each non-final substring is a known word.  The
    unsplit sequence itself is always included as the first candidate.
    """
    candidates = [[sequence]]
    for cut in range(len(sequence)):
        head = sequence[:cut]
        if not is_word(head, token_counters):
            continue
        # head is a word: recurse on the remainder and prepend it to
        # every split of the tail.
        for tail_split in get_split_candidates(sequence[cut:], token_counters):
            candidates.append([head] + tail_split)
    return candidates
def get_word_positions(string, word_dict, max_word_len=None):
    """Find every substring of *string* that is a known word.

    Returns a list of half-open ``(start, end)`` spans.  *max_word_len*
    bounds the span length scanned per start position; by default the
    whole remaining string is considered.
    """
    if max_word_len is None:
        max_word_len = len(string)
    positions = []
    for start in range(len(string)):
        # Furthest end index allowed from this start position.
        last_end = min(len(string), start + max_word_len)
        for end in range(start + 1, last_end + 1):
            if is_word(string[start:end], word_dict):
                positions.append((start, end))
    return positions
def negative_examples(token_lists, word_counters):
    """Build (X, y) training data from tokens that should NOT be split.

    For every token in every token list, each cut position where both
    halves are known words contributes one negative example (label 0)
    built from the counts of the whole token, its prefix, and its suffix.
    """
    counts_tok, counts_pre, counts_suf = [], [], []
    for tokens in token_lists:
        for token in tokens:
            for pos in range(1, len(token)):
                head, tail = token[:pos], token[pos:]
                if not (is_word(head, word_counters)
                        and is_word(tail, word_counters)):
                    continue
                counts_tok.append(get_count(token, word_counters))
                counts_pre.append(get_count(head, word_counters))
                counts_suf.append(get_count(tail, word_counters))
    X = feature_matrix(counts_pre, counts_suf, counts_tok)
    y = [0] * len(counts_tok)
    return X, y
def positive_examples(token_lists, word_counters):
    """Build (X, y) training data from token pairs that SHOULD be split.

    Every adjacent pair in each token list whose concatenation is a
    known word contributes one positive example (label 1) built from the
    counts of the merged form, the prefix, and the suffix.
    """
    counts_tok, counts_pre, counts_suf = [], [], []
    for tokens in token_lists:
        # Walk adjacent pairs; zip truncates to the shorter operand.
        for head, tail in zip(tokens, tokens[1:]):
            joined = head + tail
            if not is_word(joined, word_counters):
                continue
            counts_tok.append(get_count(joined, word_counters))
            counts_pre.append(get_count(head, word_counters))
            counts_suf.append(get_count(tail, word_counters))
    X = feature_matrix(counts_pre, counts_suf, counts_tok)
    y = [1] * len(counts_tok)
    return X, y
def get_best_splits_naive(tokens, word_counters):
    """Find the best re-segmentations of *tokens* by exhaustive scoring.

    Candidates (spans over the concatenated text) are ranked first by
    number of non-words, then by number of edit operations relative to
    the original spacing.  Returns all tied-best splits, each as a list
    of half-open ``(start, end)`` spans.  NOTE(review): candidates are
    iterated as a set, so the order of tied results is not deterministic.
    """
    #Finds the best splits based on number of nonwords and number of operations.
    orig_nonwords = sum(
        [0 if is_word(token, word_counters) else 1 for token in tokens])
    # Character offsets where the original token boundaries fall.
    orig_space_positions = cumsum([len(token) for token in tokens[:-1]])
    merged = "".join(tokens)
    word_positions = get_word_positions(merged, word_counters)
    # Candidate splits allowed to contain at most orig_nonwords - 1
    # non-words (anything worse cannot beat the original).
    candidates = candidates_from_word_positions(merged, word_positions,
                                                word_counters,
                                                orig_nonwords - 1)
    # original split as candidate:
    if len(tokens) == 1:
        original_split = [(0, len(merged))]
    else:
        original_split = [(0, orig_space_positions[0])]
        for i in range(len(orig_space_positions[:-1])):
            original_split.append(
                (orig_space_positions[i], orig_space_positions[i + 1]))
        original_split.append((orig_space_positions[-1], len(merged)))
    candidates.append(original_split)
    # Deduplicate; tuples make the span lists hashable.
    candidates = set([tuple(c) for c in candidates])
    best_splits = []
    # len(merged) is an upper bound for both scores, so the first
    # candidate always improves on these sentinels.
    best_n_nonwords = len(merged)
    best_n_operations = len(merged)
    for c in candidates:
        c_words = [merged[pos[0]:pos[1]] for pos in c]
        # Early-exit counting: we only need to know if c beats the best.
        c_n_nonwords = number_of_nonwords(c_words, word_counters,
                                          limit=best_n_nonwords)
        if c_n_nonwords <= best_n_nonwords:
            c_n_operations = number_of_operations(c, orig_space_positions)
        else:
            # Worse on non-words: give it a worst-case operation count so
            # it can never win the tie-break below.
            c_n_operations = len(merged)
        if c_n_nonwords < best_n_nonwords \
                or (c_n_nonwords == best_n_nonwords
                    and c_n_operations < best_n_operations):
            # Strictly better: restart the result list.
            best_n_nonwords = c_n_nonwords
            best_n_operations = c_n_operations
            best_splits = [c]
        elif c_n_nonwords == best_n_nonwords \
                and c_n_operations == best_n_operations:
            # Exact tie: keep alongside the current best.
            best_splits.append(c)
    best_splits = [list(split) for split in best_splits]
    return best_splits