def wrap_para(t_para, i_para, logger):
    """Re-wrap i_para so its linebreaks fall where t_para's do.

    t_para -- token list carrying the desired linebreak positions
    i_para -- token list whose text is kept but whose breaks are replaced
    logger -- message sink for warnings about unmatchable shards

    Returns a new token list; i_para is returned unchanged when no
    anchor matches between the two token streams can be found.
    """
    t_tokens, i_tokens = t_para[:], i_para[:]
    o_tokens = []
    # Compare against a break-free copy of the input.
    linebreak_to_space(i_tokens)
    matches = build_match_list(t_tokens, i_tokens)
    if not matches:
        # No anchors at all -- nothing sensible can be merged.
        return i_para
    # Handle shards before the first match.
    t_shard = t_tokens[:matches[0][0]]
    i_shard = i_tokens[:matches[0][1]]
    if t_shard or i_shard:
        # (The original re-sliced i_shard identically here; redundant.)
        o_tokens = merge_breaks(t_shard, i_shard, logger)
    # Walk consecutive match pairs; the sentinel pair covers the tail shard.
    for start, end in zip(matches, matches[1:] + [[len(t_tokens), len(i_tokens)]]):
        o_tokens.append(i_tokens[start[1]])
        t_shard = t_tokens[start[0] + 1:end[0]]
        if t_shard:
            i_shard = i_tokens[start[1] + 1:end[1]]
            o_tokens += merge_breaks(t_shard, i_shard, logger)
    if o_tokens[-1][1] & tokenise.TYPE_SPACE:
        logger.message("Warning: Additional material after final linebreak", t_shard, i_shard)
        # Use a list, not a tuple: tokens are lists everywhere else in this
        # file and downstream code mutates token[0] in place.
        o_tokens[-1] = ["\n", tokenise.TYPE_LINEBREAK | tokenise.TYPE_PARABREAK]
    return o_tokens
def merge_breaks(t_shard, i_shard, logger):
    """Copy the linebreak tokens of t_shard into the best-matching
    positions of i_shard, modifying i_shard in place and returning it.

    t_shard -- token shard that carries the wanted linebreaks
    i_shard -- token shard (linebreaks already flattened to spaces)
    logger  -- message sink for warnings when no placement is found
    """
    # If there are no line breaks in the shard, just return the i_shard.
    if not [X for X in t_shard if X[1] & tokenise.TYPE_LINEBREAK]:
        return i_shard
    # So, there are some linebreaks...
    nt_shard = t_shard[:]
    linebreak_to_space(nt_shard)
    if i_shard == nt_shard:
        # Simple case -- identical shards once linebreaks become spaces.
        for c, t in enumerate(t_shard):
            if t[1] & tokenise.TYPE_LINEBREAK:
                i_shard[c] = t
    else:
        # Bring in the big guns!
        nt_seq = tuple([tuple(X) for X in nt_shard])
        i_seq = tuple([tuple(X) for X in i_shard])
        sm = difflib.SequenceMatcher(None, nt_seq, i_seq, False)
        mb = sm.get_matching_blocks()
        if len(mb) == 1:
            # Only the terminating dummy block -- no match in shards.
            logger.message("Warning: No matches", t_shard, i_shard)
            return i_shard
        for ct, t in enumerate(t_shard):
            if t[1] & tokenise.TYPE_LINEBREAK:
                # Find the first match block after the linebreak.
                for cm, m in enumerate(mb):
                    if m[0] > ct:
                        break
                if cm == 0:
                    if ct == 0 and (i_shard[0][1] & tokenise.TYPE_SPACE):
                        i_shard[0] = t
                    else:
                        logger.message("Warning: No candidate space", t_shard, i_shard)
                elif mb[cm - 1][0] + mb[cm - 1][2] > ct:
                    # Break occurred inside the previous matchblock.
                    i_shard[ct + mb[cm - 1][1] - mb[cm - 1][0]] = t
                elif mb[cm][0] - 1 == ct:
                    # Break occurred just prior to the matchblock.
                    candidate = mb[cm][1] - 1
                    if i_shard[candidate][1] & tokenise.TYPE_SPACE:
                        i_shard[candidate] = t
                    elif i_shard[candidate][0] == "-":
                        # Line is broken at a hyphen.  Build a fresh token
                        # instead of aliasing t and then mutating t[0] in
                        # place, which corrupted the shared token inside
                        # t_shard (and the caller's token stream).
                        i_shard[candidate] = ["-" + t[0], t[1]]
                    else:
                        logger.message("Warning: No candidate space", t_shard, i_shard)
                else:
                    # Break occurred outside any matchblock.
                    logger.message("Warning: Break outside match block", t_shard, i_shard)
    return i_shard
def merge_breaks(t_shard, i_shard, logger):
    """Copy the linebreak tokens of t_shard into the best-matching
    positions of i_shard, modifying i_shard in place and returning it.

    t_shard -- token shard that carries the wanted linebreaks
    i_shard -- token shard (linebreaks already flattened to spaces)
    logger  -- message sink for warnings when no placement is found
    """
    # If there are no line breaks in the shard, just return the i_shard.
    if not [X for X in t_shard if X[1] & tokenise.TYPE_LINEBREAK]:
        return i_shard
    # So, there are some linebreaks...
    nt_shard = t_shard[:]
    linebreak_to_space(nt_shard)
    if i_shard == nt_shard:
        # Simple case -- identical shards once linebreaks become spaces.
        for c, t in enumerate(t_shard):
            if t[1] & tokenise.TYPE_LINEBREAK:
                i_shard[c] = t
    else:
        # Bring in the big guns!
        nt_seq = tuple([tuple(X) for X in nt_shard])
        i_seq = tuple([tuple(X) for X in i_shard])
        sm = difflib.SequenceMatcher(None, nt_seq, i_seq, False)
        mb = sm.get_matching_blocks()
        if len(mb) == 1:
            # Only the terminating dummy block -- no match in shards.
            logger.message("Warning: No matches", t_shard, i_shard)
            return i_shard
        for ct, t in enumerate(t_shard):
            if t[1] & tokenise.TYPE_LINEBREAK:
                # Find the first match block after the linebreak.
                for cm, m in enumerate(mb):
                    if m[0] > ct:
                        break
                if cm == 0:
                    if ct == 0 and (i_shard[0][1] & tokenise.TYPE_SPACE):
                        i_shard[0] = t
                    else:
                        logger.message("Warning: No candidate space", t_shard, i_shard)
                elif mb[cm - 1][0] + mb[cm - 1][2] > ct:
                    # Break occurred inside the previous matchblock.
                    i_shard[ct + mb[cm - 1][1] - mb[cm - 1][0]] = t
                elif mb[cm][0] - 1 == ct:
                    # Break occurred just prior to the matchblock.
                    candidate = mb[cm][1] - 1
                    if i_shard[candidate][1] & tokenise.TYPE_SPACE:
                        i_shard[candidate] = t
                    elif i_shard[candidate][0] == "-":
                        # Line is broken at a hyphen.  Build a fresh token
                        # instead of aliasing t and then mutating t[0] in
                        # place, which corrupted the shared token inside
                        # t_shard (and the caller's token stream).
                        i_shard[candidate] = ["-" + t[0], t[1]]
                    else:
                        logger.message("Warning: No candidate space", t_shard, i_shard)
                else:
                    # Break occurred outside any matchblock.
                    logger.message("Warning: Break outside match block", t_shard, i_shard)
    return i_shard
def break_para(t_paras, i_para, logger):
    """Breaks i_para into a list of paras that approximately match those in the t_paras list"""
    # Normalise the text side: join the paragraphs, drop the leading
    # sign token, and flatten its linebreaks to spaces.
    t_tokens = join_paras(t_paras)[1:]
    linebreak_to_space(t_tokens)
    # Normalise the input side the same way, keeping a copy of the
    # original tokens so their linebreaks can be restored afterwards.
    i_tokens = i_para[1:]
    i_oldtokens = i_tokens[:]
    linebreak_to_space(i_tokens)
    # Stamp a parabreak token onto t_tokens at each paragraph boundary.
    pos = 0
    for para in t_paras:
        pos += len(para) - 1
        assert t_tokens[pos - 1][1] & tokenise.TYPE_SPACE
        t_tokens[pos - 1] = ["\n", tokenise.TYPE_LINEBREAK | tokenise.TYPE_PARABREAK]
    # Re-wrap the input tokens against the text layout.
    i_tokens = wrap.wrap_para(t_tokens, i_tokens, logger)
    # Wherever an original linebreak came back as a plain space, put the
    # old linebreak token back.
    for idx, old in enumerate(i_oldtokens):
        if (old[1] & tokenise.TYPE_LINEBREAK) and (i_tokens[idx][1] & tokenise.TYPE_SPACE):
            i_tokens[idx] = old
    return split_and_sign_paras(i_tokens)
def wrap_para(t_para, i_para, logger):
    """Re-wrap i_para so its linebreaks fall where t_para's do.

    t_para -- token list carrying the desired linebreak positions
    i_para -- token list whose text is kept but whose breaks are replaced
    logger -- message sink for warnings about unmatchable shards

    Returns a new token list; i_para is returned unchanged when no
    anchor matches between the two token streams can be found.
    """
    t_tokens, i_tokens = t_para[:], i_para[:]
    o_tokens = []
    # Compare against a break-free copy of the input.
    linebreak_to_space(i_tokens)
    matches = build_match_list(t_tokens, i_tokens)
    if not matches:
        # No anchors at all -- nothing sensible can be merged.
        return i_para
    # Handle shards before the first match.
    t_shard = t_tokens[: matches[0][0]]
    i_shard = i_tokens[: matches[0][1]]
    if t_shard or i_shard:
        # (The original re-sliced i_shard identically here; redundant.)
        o_tokens = merge_breaks(t_shard, i_shard, logger)
    # Walk consecutive match pairs; the sentinel pair covers the tail shard.
    for start, end in zip(matches, matches[1:] + [[len(t_tokens), len(i_tokens)]]):
        o_tokens.append(i_tokens[start[1]])
        t_shard = t_tokens[start[0] + 1 : end[0]]
        if t_shard:
            i_shard = i_tokens[start[1] + 1 : end[1]]
            o_tokens += merge_breaks(t_shard, i_shard, logger)
    if o_tokens[-1][1] & tokenise.TYPE_SPACE:
        logger.message("Warning: Additional material after final linebreak", t_shard, i_shard)
        # Use a list, not a tuple: tokens are lists everywhere else in this
        # file and downstream code mutates token[0] in place.
        o_tokens[-1] = ["\n", tokenise.TYPE_LINEBREAK | tokenise.TYPE_PARABREAK]
    return o_tokens