Example #1
File: wrap.py Project: JonHurst/rewrap
def wrap_para(t_para, i_para, logger):
    t_tokens, i_tokens = t_para[:], i_para[:]
    o_tokens = []
    linebreak_to_space(i_tokens)
    matches = build_match_list(t_tokens, i_tokens)
    #handle shards before first match
    if not matches: return i_para
    t_shard = t_tokens[:matches[0][0]]
    i_shard = i_tokens[:matches[0][1]]
    if t_shard or i_shard:
        i_shard = i_tokens[0:matches[0][1]]
        o_tokens = merge_breaks(t_shard, i_shard, logger)
    for start, end in zip(matches, matches[1:] +
                          [[len(t_tokens), len(i_tokens)]]):
        o_tokens.append(i_tokens[start[1]])
        t_shard = t_tokens[start[0] + 1:end[0]]
        if t_shard:
            i_shard = i_tokens[start[1] + 1:end[1]]
            o_tokens += merge_breaks(t_shard, i_shard, logger)
    if o_tokens[-1][1] & tokenise.TYPE_SPACE:
        logger.message("Warning: Additional material after final linebreak",
                       t_shard, i_shard)
        o_tokens[-1] = ("\n",
                        tokenise.TYPE_LINEBREAK | tokenise.TYPE_PARABREAK)
    return o_tokens
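Taken together, wrap_para re-imposes the target paragraph's line breaks on the input paragraph: build_match_list (not shown here) supplies anchor pairs, both token lists are cut into shards at those anchors, and merge_breaks copies the breaks across within each shard. A minimal sketch of the data shapes this assumes; the flag values and the exact return shape of build_match_list are inferred from the code above, not taken from the project:

# Sketch only: tokens appear to be [text, flags] pairs; the flag values here
# are stand-ins for the constants defined in the project's tokenise module.
TYPE_SPACE = 1
TYPE_LINEBREAK = 2

# Target paragraph, broken after "brown":
t_para = [["quick", 0], [" ", TYPE_SPACE], ["brown", 0],
          ["\n", TYPE_LINEBREAK], ["fox", 0]]
# Edited input paragraph, currently broken after "quick":
i_para = [["quick", 0], ["\n", TYPE_LINEBREAK], ["brown", 0],
          [" ", TYPE_SPACE], ["fox", 0]]
# build_match_list seems to return [t_index, i_index] anchor pairs, e.g.
# [[0, 0], [2, 2], [4, 4]]; wrap_para then walks the gap after each anchor so
# that merge_breaks can restore the target's break -- here the output would
# carry a space at index 1 and the "\n" at index 3.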
Example #2
File: wrap.py Project: JonHurst/rewrap
def merge_breaks(t_shard, i_shard, logger):
    #if there are no line breaks in the shard, just return the i_shard
    if not [X for X in t_shard if X[1] & tokenise.TYPE_LINEBREAK]:
        return i_shard
    #so, there are some linebreaks...
    nt_shard = t_shard[:]
    linebreak_to_space(nt_shard)
    if i_shard == nt_shard:
        #simple case -- identical shards if you replace the linebreaks with spaces
        for c, t in enumerate(t_shard):
            if t[1] & tokenise.TYPE_LINEBREAK:
                i_shard[c] = t
    else:
        #bring in the big guns!
        nt_seq = tuple([tuple(X) for X in nt_shard])
        i_seq = tuple([tuple(X) for X in i_shard])
        sm = difflib.SequenceMatcher(None, nt_seq, i_seq, False)
        mb = sm.get_matching_blocks()
        if len(mb) == 1:
            #no match in shards
            logger.message("Warning: No matches", t_shard, i_shard)
            return i_shard
        for ct, t in enumerate(t_shard):
            if t[1] & tokenise.TYPE_LINEBREAK:
                #find the match block after the linebreak
                for cm, m in enumerate(mb):
                    if m[0] > ct: break
                if cm == 0:
                    if ct == 0 and (i_shard[0][1] & tokenise.TYPE_SPACE):
                        i_shard[0] = t
                    else:
                        logger.message("Warning: No candidate space", t_shard,
                                       i_shard)
                elif mb[cm - 1][0] + mb[cm - 1][2] > ct:
                    #break occurred inside prev matchblock
                    i_shard[ct + mb[cm - 1][1] - mb[cm - 1][0]] = t
                elif mb[cm][0] - 1 == ct:
                    #break occurred just prior to matchblock
                    candidate = mb[cm][1] - 1
                    if i_shard[candidate][1] & tokenise.TYPE_SPACE:
                        i_shard[candidate] = t
                    elif i_shard[candidate][0] == "-":
                        #line is broken at hyphen -- keep the hyphen ahead of the break
                        i_shard[candidate] = ["-" + t[0], t[1]]
                    else:
                        logger.message("Warning: No candidate space", t_shard,
                                       i_shard)
                else:
                    #break occurred outside matchblock
                    logger.message("Warning: Break outside match block",
                                   t_shard, i_shard)
    return i_shard
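For the common case, where the two shards differ only in where the lines break, the first branch is enough: once linebreaks are replaced by spaces the shards compare equal, so each break token can be copied back by index. A standalone sketch of just that branch; the flag values and the linebreak_to_space stand-in below are assumptions, since the real helpers live elsewhere in the project:

# Illustration of merge_breaks' "simple case" branch; flag values are stand-ins.
TYPE_SPACE = 1
TYPE_LINEBREAK = 2

def linebreak_to_space(tokens):
    #replace every linebreak token with a plain space token, in place
    for c, tok in enumerate(tokens):
        if tok[1] & TYPE_LINEBREAK:
            tokens[c] = [" ", TYPE_SPACE]

t_shard = [["over", 0], ["\n", TYPE_LINEBREAK], ["the", 0],
           [" ", TYPE_SPACE], ["lazy", 0]]
i_shard = [["over", 0], [" ", TYPE_SPACE], ["the", 0],
           [" ", TYPE_SPACE], ["lazy", 0]]

nt_shard = t_shard[:]
linebreak_to_space(nt_shard)
if i_shard == nt_shard:
    #shards are identical once breaks become spaces, so each break is copied
    #back into the input shard at the same index
    for c, t in enumerate(t_shard):
        if t[1] & TYPE_LINEBREAK:
            i_shard[c] = t

print(i_shard[1])   # ['\n', 2] -- the target's line break has been restored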
Example #3
def break_para(t_paras, i_para, logger):
    """Breaks i_para into a list of paras that approximately match those in the t_paras list"""
    #normalise t_paras
    t_tokens = join_paras(t_paras)[1:]
    linebreak_to_space(t_tokens)
    #normalise i_para
    i_tokens = i_para[1:]
    i_oldtokens = i_tokens[:]
    linebreak_to_space(i_tokens)
    #insert parabreak tokens into t_tokens
    c = 0
    for p in t_paras:
        c += len(p) - 1
        assert t_tokens[c - 1][1] & tokenise.TYPE_SPACE
        t_tokens[c - 1] = ["\n", tokenise.TYPE_LINEBREAK | tokenise.TYPE_PARABREAK]
    #wrap i_tokens using t_tokens
    i_tokens = wrap.wrap_para(t_tokens, i_tokens, logger)
    #copy old linebreaks back into i_tokens
    for c, t in enumerate(i_oldtokens):
        if (t[1] & tokenise.TYPE_LINEBREAK) and (i_tokens[c][1] & tokenise.TYPE_SPACE):
            i_tokens[c] = t
    return split_and_sign_paras(i_tokens)
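The closing loop keeps the input's own wrapping wherever it does not collide with the new paragraph structure: any position that held a linebreak in the original input and holds a plain space after wrap_para gets its old break back. A self-contained sketch of just that step, with flag values assumed for illustration:

# Illustration of the "copy old linebreaks back" step; flag values are assumed.
TYPE_SPACE = 1
TYPE_LINEBREAK = 2
TYPE_PARABREAK = 4

i_oldtokens = [["a", 0], ["\n", TYPE_LINEBREAK], ["b", 0],
               [" ", TYPE_SPACE], ["c", 0]]
# What wrap_para might have produced: the old break flattened to a space,
# and a new paragraph break inserted after "b".
i_tokens = [["a", 0], [" ", TYPE_SPACE], ["b", 0],
            ["\n", TYPE_LINEBREAK | TYPE_PARABREAK], ["c", 0]]

for c, t in enumerate(i_oldtokens):
    if (t[1] & TYPE_LINEBREAK) and (i_tokens[c][1] & TYPE_SPACE):
        i_tokens[c] = t

# Index 1 regains the input's own linebreak; index 3 keeps the new paragraph
# break because its flags do not include TYPE_SPACE.
print(i_tokens[1], i_tokens[3])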