Example #1
from collections import defaultdict

def get_ins_dels(incorr_line, correct_line):
    # Collect insertion and deletion counts from the diff between an
    # incorrect line and its corrected counterpart.
    # Returns (ins, dels, rejected_flag); a pair is rejected when any
    # single insertion is longer than two tokens.
    ins = defaultdict(int)
    dels = defaultdict(int)

    incorr_tokens = custom_tokenize(incorr_line, wordpiece_tokenizer, mode="train")
    correct_tokens = custom_tokenize(correct_line, wordpiece_tokenizer, mode="train")
    diffs = seq2edits_utils.ndiff(incorr_tokens, correct_tokens)

    for item in diffs:
        if item[0] == "+":
            if len(item[2:].split()) > 2:
                return defaultdict(int), defaultdict(int), 1
            ins[item[2:]] += 1
        elif item[0] == "-":
            dels[item[2:]] += 1

    return ins, dels, 0
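
A minimal usage sketch for get_ins_dels, assuming a parallel corpus of incorrect and corrected sentences (the file names below are illustrative, not part of the original code): insertion and deletion counts are aggregated across all pairs to surface the most frequent edits.

from collections import Counter

total_ins, total_dels = Counter(), Counter()
num_rejected = 0
with open("incorrect.txt") as f_inc, open("correct.txt") as f_cor:
    for incorr_line, correct_line in zip(f_inc, f_cor):
        ins, dels, rejected = get_ins_dels(incorr_line, correct_line)
        if rejected:
            # pair skipped: it required inserting more than two tokens at once
            num_rejected += 1
            continue
        total_ins.update(ins)
        total_dels.update(dels)

print("most frequent insertions:", total_ins.most_common(10))
print("most frequent deletions:", total_dels.most_common(10))
print("rejected pairs:", num_rejected)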
Example #2
def get_tuple(line):
    # Tokenize a line (optionally spell-checking it first) and map the
    # resulting word-piece tokens to vocabulary ids.
    if FLAGS.do_spell_check:
        line = line.strip().split()
        line = wordpiece_tokenizer.basic_tokenizer._run_spell_check(line)
        line = " ".join(line)
    tokens = custom_tokenize(line, wordpiece_tokenizer)
    token_ids = wordpiece_tokenizer.convert_tokens_to_ids(tokens)
    return tokens, token_ids
Example #3
def seq2edits(incorr_line, correct_line):
    # Seq2Edits function (described in Section 2.2 of the paper):
    # obtains edit ids from incorrect and correct tokens.
    # input: incorrect line and correct line
    # output: incorr_tokens, correct_tokens, incorr token ids, edit ids

    # tokenize incorr_line and correct_line
    incorr_tokens = custom_tokenize(incorr_line,
                                    wordpiece_tokenizer,
                                    mode="train")
    correct_tokens = custom_tokenize(correct_line,
                                     wordpiece_tokenizer,
                                     mode="train")
    # generate diffs using a modified edit-distance algorithm
    # (described in Appendix A.1 of the paper)
    diffs = seq2edits_utils.ndiff(incorr_tokens, correct_tokens)
    # align diffs to get edits
    edit_ids = diffs_to_edits(diffs)

    if not edit_ids:
        return None
    # get incorrect token ids
    incorr_tok_ids = wordpiece_tokenizer.convert_tokens_to_ids(incorr_tokens)
    return incorr_tokens, correct_tokens, incorr_tok_ids, edit_ids
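
A short usage sketch, assuming wordpiece_tokenizer, custom_tokenize and seq2edits_utils are initialised at module level as in the snippet above (the sentence pair is illustrative):

pair = seq2edits("He go to school .", "He goes to school .")
if pair is None:
    print("diff could not be aligned into edit ids; pair skipped")
else:
    incorr_tokens, correct_tokens, incorr_tok_ids, edit_ids = pair
    print(incorr_tokens)   # word-piece tokens of the incorrect sentence
    print(edit_ids)        # edit labels used as training targets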
Example #4
def get_tuple(wordpiece_tokenizer, line, do_spell_check):
    """
    进行tokenize.

    Args:
        wordpiece_tokenizer: class, tokenizer
        line: string, 输入文本
        do_spell_check: boolean, 是否进行autocorrect的拼写检查

    Return:
        tokens, token_ids

    """
    if do_spell_check:
        line = line.strip().split()
        line = wordpiece_tokenizer.basic_tokenizer._run_spell_check(line)
        line = " ".join(line)
    tokens = custom_tokenize(line, wordpiece_tokenizer)
    token_ids = wordpiece_tokenizer.convert_tokens_to_ids(tokens)
    return tokens, token_ids
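
A usage sketch for this variant, which takes the tokenizer and the spell-check flag explicitly. It assumes wordpiece_tokenizer is the project's customised word-piece tokenizer (the one exposing basic_tokenizer._run_spell_check), built from a vocabulary file elsewhere; the example sentence is illustrative.

tokens, token_ids = get_tuple(wordpiece_tokenizer,
                              "Ther is a eror in this sentense .",
                              do_spell_check=True)
print(tokens)      # word-piece tokens after optional spell correction
print(token_ids)   # corresponding vocabulary ids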