Example No. 1
def label_edits(pairs, args):
    annotator = errant.load("en")
    labels = []
    # Process each line of all input files
    for orig, cors in tqdm(pairs):
        label = []
        # Get the original and all the corrected texts
        orig = orig.strip()
        cors = [cors]
        # Skip the line if orig is empty
        if not orig: continue
        # Parse orig with spacy
        orig = annotator.parse(orig, args.tok)
        # Loop through the corrected texts
        for cor_id, cor in enumerate(cors):
            cor = cor.strip()
            # If the texts are the same, write a noop edit
            if orig.text.strip() == cor:
                label.append(noop_edit(cor_id).split('|||')[1])
            # Otherwise, do extra processing
            else:
                # Parse cor with spacy
                cor = annotator.parse(cor, args.tok)
                # Align the texts and extract and classify the edits
                edits = annotator.annotate(orig, cor, args.lev, args.merge)
                # Loop through the edits
                for edit in edits:
                    # Write the edit to the output m2 file
                    label.append(edit.to_m2(cor_id).split('|||')[1])
        # Collect the labels for this line once all corrections have been processed
        labels.append(label)
    return labels
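# Usage sketch (not from the original source): label_edits only reads
# args.tok, args.lev and args.merge, so a simple namespace with those
# fields is enough to try it out.
#
#   from argparse import Namespace
#   args = Namespace(tok=False, lev=False, merge="rules")
#   pairs = [("This are a sentence .", "This is a sentence .")]
#   print(label_edits(pairs, args))   # e.g. [['R:VERB:SVA']]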
Example No. 2
def main():
    # Parse command line args
    args = parse_args()
    print("Loading resources...")
    # Load Errant
    annotator = errant.load("en")
    # Open output m2 file
    out_m2 = open(args.out, "w")

    print("Processing parallel files...")
    # Read both files into memory and label them in parallel across args.n_procs workers
    # (adapted from https://tinyurl.com/y4cj4gth)
    with ExitStack() as stack:
        orig_lines = stack.enter_context(open(args.orig, encoding='utf-8')).readlines()
        cor_lines = stack.enter_context(open(args.cor[0], encoding='utf-8')).readlines()
        pairs = list(zip(orig_lines, cor_lines))
        batch_size = len(orig_lines) // args.n_procs
        splits = split(pairs, batch_size)
        partial_func = partial(label_edits, args=args)

        with Pool(args.n_procs) as pool:
            results = pool.map(partial_func, splits)
        labeled = merge(results)

        for label in tqdm(labeled):
            out_m2.write(','.join(label) + '\n')
    out_m2.close()
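# Note: the split and merge helpers used above are not shown in this example.
# A minimal sketch of what they might look like (an assumption, not the
# original implementation):
def split(items, batch_size):
    # Chunk a list into consecutive batches of at most batch_size items
    batch_size = max(batch_size, 1)  # guard against a zero batch size
    return [items[i:i + batch_size] for i in range(0, len(items), batch_size)]

def merge(batches):
    # Flatten the per-process batches back into a single flat list
    return [item for batch in batches for item in batch]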
Example No. 3
def main():
    # Parse command line args
    args = parse_args()
    print("Loading resources...")
    # Load Errant
    if args.lang == "English":
        annotator = errant.load("en")
    elif args.lang == "Russian":
        annotator = errant.load("ru")
    print("Processing parallel files...")
    # Process an arbitrary number of files line by line simultaneously. Python 3.3+
    # See https://tinyurl.com/y4cj4gth . Also opens the output m2 file.
    with ExitStack() as stack, open(args.out, "w") as out_m2:
        in_files = [
            stack.enter_context(open(i)) for i in [args.orig] + args.cor
        ]
        # Process each line of all input files
        for line in zip(*in_files):
            # Get the original and all the corrected texts
            orig = line[0].strip()
            cors = line[1:]
            # Skip the line if orig is empty
            if not orig:
                continue
            # Parse orig with spacy
            orig = annotator.parse(orig, args.tok)
            # Write orig to the output m2 file
            out_m2.write(" ".join(["S"] + [token.text
                                           for token in orig]) + "\n")
            # Loop through the corrected texts
            for cor_id, cor in enumerate(cors):
                cor = cor.strip()
                # If the texts are the same, write a noop edit
                if orig.text.strip() == cor:
                    out_m2.write(noop_edit(cor_id) + "\n")
                # Otherwise, do extra processing
                else:
                    # Parse cor with spacy
                    cor = annotator.parse(cor, args.tok)
                    # Align the texts and extract and classify the edits
                    edits = annotator.annotate(orig, cor, args.lev, args.merge)
                    # Loop through the edits
                    for edit in edits:
                        # Write the edit to the output m2 file
                        out_m2.write(edit.to_m2(cor_id) + "\n")
            # Write a newline when we have processed all corrections for each line
            out_m2.write("\n")
def get_action(s1, s2):
    annotator = errant.load('en')
    orig = annotator.parse(s1)
    cor = annotator.parse(s2)
    edits = annotator.annotate(orig, cor)
    for e in edits:
        if 'R:' in e.type:
            return 'replace'
        elif 'M:' in e.type:
            return 'insert'
        elif 'U:' in e.type:
            return 'remove'
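# Usage sketch (assumed, not part of the original): ERRANT classifies
# "are" -> "is" as a replacement edit (R:VERB:SVA), so get_action would
# return 'replace' here.
#
#   get_action('This are a sentence .', 'This is a sentence .')  # -> 'replace'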
Example No. 5
def main():
    # Parse command line args
    args = parse_args()
    print("Loading resources...")
    # Load Errant based on the language
    annotator = errant.load(args.lang)
    # Open output m2 file
    out_m2 = open(args.out, "w", encoding='utf-8')

    print("Processing parallel files...")
    # Process an arbitrary number of files line by line simultaneously. Python 3.3+
    # See https://tinyurl.com/y4cj4gth
    with ExitStack() as stack:
        in_files = [
            stack.enter_context(open(i)) for i in [args.orig] + args.cor
        ]
        # Process each line of all input files
        for line in zip(*in_files):
            # Get the original and all the corrected texts
            orig = line[0].strip()
            cors = line[1:]
            # Skip the line if orig is empty
            if not orig: continue
            # Parse orig with spacy into a Doc of tokens
            orig = annotator.parse(orig, args.tok)
            # Write orig to the output m2 file
            out_m2.write(" ".join(["S"] + [token.text
                                           for token in orig]) + "\n")
            # Loop through the corrected texts
            for cor_id, cor in enumerate(cors):
                cor = cor.strip()
                # If the texts are the same, write a noop edit
                # Join the parsed tokens (rather than using orig.text.strip()) so we compare against the tokenised original
                if ' '.join(o.text for o in orig) == cor:
                    out_m2.write(noop_edit(cor_id) + "\n")
                # Otherwise, do extra processing
                else:
                    # Parse cor with spacy
                    cor = annotator.parse(cor, args.tok)
                    # Align the texts and extract and classify the edits
                    edits = annotator.annotate(orig, cor, args.lev, args.merge)
                    # Loop through the edits
                    for edit in edits:
                        # Write the edit to the output m2 file
                        out_m2.write(edit.to_m2(cor_id) + "\n")
            # Write a newline when we have processed all corrections for each line
            out_m2.write("\n")
def get_category(s1, s2):
    annotator = errant.load('en')
    orig = annotator.parse(s1)
    cor = annotator.parse(s2)
    edits = annotator.annotate(orig, cor)
    for e in edits:
        if 'DET' in e.type:
            return 'Articles'
        elif 'PREP' in e.type or 'PART' in e.type:
            return 'Preposition'
        elif 'PUNCT' in e.type or get_diff(e.o_str, e.c_str)[0] in punc:
            return 'Punctuation'
        elif 'VERB' in e.type or set(ing).issubset(
                set(get_diff(e.o_str, e.c_str))):
            if 'SVA' in e.type:
                return 'Subject Verb Agreement'
            else:
                return 'Verb Form'
        elif 'NOUN' in e.type or 'ADJ' in e.type or 'MORPH' in e.type or 'SPELL' in e.type or 'ORTH' in e.type:
            return 'Word Form'
        else:
            return 'Other'
def get_explanation(s1, s2):
    annotator = errant.load('en')
    orig = annotator.parse(s1)
    cor = annotator.parse(s2)
    edits = annotator.annotate(orig, cor)
    error = get_category(s1, s2)
    if error == 'Verb Form':
        for e in edits:
            if 'TENSE' in e.type:
                if 'R:' in e.type:
                    return f"Verb tense error, should replace '{e.o_str}' with '{e.c_str}'."
                elif 'M:' in e.type:
                    return f"Verb tense error, should insert '{e.c_str}'."
                else:
                    return f"Verb tense error, should remove '{e.o_str}'."
            elif 'FORM' in e.type:
                if 'R:' in e.type:
                    return f"Verb form error, should replace '{e.o_str}' with '{e.c_str}'."
                elif 'M:' in e.type:
                    return f"Verb form error, should insert '{e.c_str}'."
                else:
                    return f"Verb form error, should remove '{e.o_str}'."
            elif set(ing).issubset(set(get_diff(e.o_str, e.c_str))):
                if 'R:' in e.type:
                    return f"Present continuous tense, should replace '{e.o_str}' with '{e.c_str}'."
                elif 'M:' in e.type:
                    return f"Present continuous tense, should insert '{e.c_str}'."
                else:
                    return f"Present continuous tense, should remove '{e.o_str}'."
            else:
                if 'R:' in e.type:
                    return f"Other verb error, should replace '{e.o_str}' with '{e.c_str}'."
                elif 'M:' in e.type:
                    return f"Other verb error, should insert '{e.c_str}'."
                else:
                    return f"Other verb error, should remove '{e.o_str}'."
    elif error == 'Word Form':
        for e in edits:
            if 'NUM' in e.type:
                if 'R:' in e.type:
                    return f"Noun Number error, should replace '{e.o_str}' with '{e.c_str}'."
                elif 'M:' in e.type:
                    return f"Noun Number error, should insert '{e.c_str}'."
                else:
                    return f"Noun Number error, should remove '{e.o_str}'."
            elif 'ADJ' in e.type:
                if 'R:' in e.type:
                    return f"Adjective error, should replace '{e.o_str}' with '{e.c_str}'."
                elif 'M:' in e.type:
                    return f"Adjective error, should insert '{e.c_str}'."
                else:
                    return f"Adjective error, should remove '{e.o_str}'."
            elif 'MORPH' in e.type:
                if 'R:' in e.type:
                    return f"Morphology error, should replace '{e.o_str}' with '{e.c_str}'."
                elif 'M:' in e.type:
                    return f"Morphology error, should insert '{e.c_str}'."
                else:
                    return f"Morphology error, should remove '{e.o_str}'."
            elif 'ORTH' in e.type:
                if 'R:' in e.type:
                    return f"Orthography error, should replace '{e.o_str}' with '{e.c_str}'."
                elif 'M:' in e.type:
                    return f"Orthography error, should insert '{e.c_str}'."
                else:
                    return f"Orthography error, should remove '{e.o_str}'."
            else:
                if 'R:' in e.type:
                    return f"Other word form error, should replace '{e.o_str}' with '{e.c_str}'."
                elif 'M:' in e.type:
                    return f"Other word form error, should insert '{e.c_str}'."
                else:
                    return f"Other word form error, should remove '{e.o_str}'."
    elif error == 'Punctuation':
        for e in edits:
            if 'M:' in e.type:
                expla = f"Consider add punctuation '{e.c_str}' in your sentence."
                return expla
            if 'R:' in e.type:
                expla = f"Consider change the punctuation into '{e.c_str}'."
                return expla
            if 'U:' in e.type:
                expla = f"Please remove the unnecessary punctuation '{e.c_str}'."
                return expla
    elif error == 'Subject Verb Agreement':
        for e in edits:
            expla = f"Please check the subject-verb agreement and choose the appropriate form of the verb '{e.c_str}'."
            return expla
    elif error == 'Articles':
        for e in edits:
            if 'R:' in e.type:
                expla = f"Consider article '{e.c_str}' in front of countable or singular nouns referring to people or things what have not already been mentioned."
                return expla
            elif 'M:' in e.type:
                expla = f"Article '{e.c_str}' is required because of the countable or singular nouns referring to people or things what have not already been mentioned."
                return expla
            elif 'U:' in e.type:
                return 'No article required'
    elif error == 'Preposition':
        for e in edits:
            if 'R:' in e.type:
                expla = f"Consider '{e.c_str}' to be the proper preposition."
                return expla
            elif 'M:' in e.type:
                expla = f"You need a preposition '{e.c_str}'before a noun or pronoun to show place, position, time or method."
                return expla
            elif 'U:' in e.type:
                expla = f"You don't need preposition '{e.o_str}' here, consider to remove it."
                return expla
            else:
                return 'Others'
Example No. 8
def main():
    # Parse command line args
    args = parse_args()
    print("Loading resources...")
    # Load Errant
    annotator = errant.load("en")
    # Open output M2 file
    out_m2 = open(args.out, "w")

    print("Processing M2 file...")
    # Open the m2 file and split it into text+edit blocks
    m2 = open(args.m2_file).read().strip().split("\n\n")
    # Loop through the blocks
    for m2_block in m2:
        m2_block = m2_block.strip().split("\n")
        # Write the original text to the output M2 file
        out_m2.write(m2_block[0] + "\n")
        # Parse orig with spacy
        orig = annotator.parse(m2_block[0][2:])
        # Simplify the edits and sort by coder id
        edit_dict = simplify_edits(m2_block[1:])
        # Loop through coder ids
        for id, raw_edits in sorted(edit_dict.items()):
            # If the first edit is a noop
            if raw_edits[0][2] == "noop":
                # Write the noop and continue
                out_m2.write(noop_edit(id) + "\n")
                continue
            # Apply the edits to generate the corrected text
            # Also redefine the edits as orig and cor token offsets
            cor, gold_edits = get_cor_and_edits(m2_block[0][2:], raw_edits)
            # Parse cor with spacy
            cor = annotator.parse(cor)
            # Save detection edits here for auto
            det_edits = []
            # Loop through the gold edits
            for gold_edit in gold_edits:
                # Do not minimise detection edits
                if gold_edit[-2] in {"Um", "UNK"}:
                    edit = annotator.import_edit(orig,
                                                 cor,
                                                 gold_edit[:-1],
                                                 min=False,
                                                 old_cat=args.old_cats)
                    # Overwrite the pseudo correction and set it in the edit
                    edit.c_toks = annotator.parse(gold_edit[-1])
                    # Save the edit for auto
                    det_edits.append(edit)
                    # Write the edit for gold
                    if args.gold:
                        # Write the edit
                        out_m2.write(edit.to_m2(id) + "\n")
                # Gold annotation
                elif args.gold:
                    edit = annotator.import_edit(orig, cor, gold_edit[:-1],
                                                 not args.no_min,
                                                 args.old_cats)
                    # Write the edit
                    out_m2.write(edit.to_m2(id) + "\n")
            # Auto annotations
            if args.auto:
                # Auto edits
                edits = annotator.annotate(orig, cor, args.lev, args.merge)
                # Combine detection and auto edits and sort by orig offsets
                edits = sorted(det_edits + edits,
                               key=lambda e: (e.o_start, e.o_end))
                # Write the edits to the output M2 file
                for edit in edits:
                    out_m2.write(edit.to_m2(id) + "\n")
        # Write a newline when there are no more edits
        out_m2.write("\n")
Example No. 9
import errant
#
# annotator = errant.load('en')
# orig = annotator.parse('This are gramamtical sentence .')
# cor = annotator.parse('This is a grammatical sentence .')
# edit = [1, 2, 1, 2, 'SVA'] # are -> is
# edit = annotator.import_edit(orig, cor, edit)
# print(edit.to_m2())

annotator = errant.load('en')
orig = annotator.parse('This are gramamtical sentence .')
cor = annotator.parse('This is a grammatical sentence .')
alignment = annotator.align(orig, cor)
edits = annotator.merge(alignment)
for e in edits:
    e = annotator.classify(e)
    print(e)
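# For reference, the align/merge/classify steps above can also be collapsed
# into a single call with the errant API (equivalent shorthand):
edits = annotator.annotate(orig, cor)
for e in edits:
    print(e)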
Example No. 10
def main():
    # Parse command line args
    args = parse_args()
    print("Loading resources...")
    # Load Tokenizer and other resources
    nlp = spacy.load("en")
    # Load Errant
    annotator = errant.load("en", nlp)
    # Punctuation normalisation dictionary
    norm_dict = {
        "’": "'",
        "´": "'",
        "‘": "'",
        "′": "'",
        "`": "'",
        '“': '"',
        '”': '"',
        '˝': '"',
        '¨': '"',
        '„': '"',
        '『': '"',
        '』': '"',
        '–': '-',
        '—': '-',
        '―': '-',
        '¬': '-',
        '、': ',',
        ',': ',',
        ':': ':',
        ';': ';',
        '?': '?',
        '!': '!',
        'ِ': ' ',
        '\u200b': ' '
    }
    norm_dict = {ord(k): v for k, v in norm_dict.items()}
    # Open output M2 file
    out_m2 = open(args.out, "w")

    print("Preprocessing files...")
    # Open the file
    with open(args.json_file) as data:
        # Process each line
        for line in data:
            # Load the JSON line
            line = json.loads(line)
            # Normalise certain punctuation in the text
            text = line["text"].translate(norm_dict)
            # Store the sentences and edits for all annotators here
            coder_dict = {}
            # Loop through the annotator ids and their edits
            for coder, edits in line["edits"]:
                # Add the coder to the coder_dict if needed
                if coder not in coder_dict: coder_dict[coder] = []
                # Split the essay into paras and update and normalise the char edits
                para_info = get_paras(text, edits, norm_dict)
                # Loop through the paras and edits
                for orig_para, para_edits in para_info:
                    # Remove unnecessary whitespace from para and update char edits
                    orig_para, para_edits = clean_para(orig_para, para_edits)
                    if not orig_para: continue  # Ignore empty paras
                    # Convert character edits to token edits based on spacy tokenisation
                    orig_para = nlp(orig_para)
                    para_edits = get_token_edits(orig_para, para_edits, nlp)
                    # Split the paragraph into sentences, if needed, and update tok edits
                    sents = get_sents(orig_para,
                                      para_edits,
                                      sent_tokenised=True)
                    # Save the sents in the coder_dict
                    coder_dict[coder].extend(sents)
            # Document level M2 file. Merge the text as a single long string
            if args.docs: coder_dict = doc_m2(coder_dict)
            # Get the sorted coder ids
            coder_ids = sorted(coder_dict.keys())
            # Loop through the sentences for the first coder
            for sent_id, sent in enumerate(coder_dict[0]):
                # Write the original sentence to the output M2 file
                out_m2.write("S " + " ".join(sent["orig"]) + "\n")
                # Annotate the original sentence with spacy
                orig = annotator.parse(" ".join(sent["orig"]))
                # Loop through the coders
                for id in coder_ids:
                    # Annotate the corrected sentence with spacy and get the gold edits
                    cor = annotator.parse(" ".join(
                        coder_dict[id][sent_id]["cor"]))
                    gold_edits = coder_dict[id][sent_id]["edits"]
                    # Gold edits
                    if args.gold:
                        # Make sure edits are ordered by orig start and end offsets.
                        gold_edits = sorted(gold_edits,
                                            key=itemgetter(0))  # Start
                        gold_edits = sorted(gold_edits,
                                            key=itemgetter(1))  # End
                        proc_edits = []
                        # Loop through the gold edits.
                        for gold_edit in gold_edits:
                            # Format the edit for errant import
                            gold_edit = gold_edit[:2] + gold_edit[-2:] + [
                                gold_edit[2]
                            ]
                            # Detection edits (never minimised)
                            if gold_edit[-1] == "D":
                                gold_edit = annotator.import_edit(
                                    orig,
                                    cor,
                                    gold_edit,
                                    min=False,
                                    old_cat=args.old_cats)
                            # Correction edits
                            else:
                                gold_edit = annotator.import_edit(
                                    orig, cor, gold_edit, not args.no_min,
                                    args.old_cats)
                                # Ignore edits that have been minimised to nothing
                                if gold_edit.o_start == gold_edit.o_end and \
                                    not gold_edit.c_str:
                                    continue
                            # Save the edit in proc edits
                            proc_edits.append(gold_edit)
                        # If there are no edits, write an explicit noop edit.
                        if not proc_edits:
                            out_m2.write(noop_edit(id) + "\n")
                        # Write the edits to the output M2 file
                        for edit in proc_edits:
                            out_m2.write(edit.to_m2(id) + "\n")
                    # Auto edits
                    elif args.auto:
                        auto_edits = annotator.annotate(
                            orig, cor, args.lev, args.merge)
                        # If there are no edits, write an explicit noop edit.
                        if not auto_edits:
                            out_m2.write(noop_edit(id) + "\n")
                        # Write the edits to the output M2 file
                        for edit in auto_edits:
                            out_m2.write(edit.to_m2(id) + "\n")
                # Write new line after each sentence when we reach last coder.
                out_m2.write("\n")
Example No. 11
import logging
import requests
import tornado.web
from time import time
from gector.gec_model import GecBERTModel
from utils.helpers import add_sents_idx, add_tokens_idx, token_level_edits, forward_merge_corrections, backward_merge_corrections
from copy import deepcopy
import pprint
import errant
import spacy

logging.basicConfig(
    format='%(levelname)s: [%(asctime)s][%(filename)s:%(lineno)d] %(message)s',
    level=logging.INFO)

nlp = spacy.load("en")
annotator = errant.load(lang='en', nlp=nlp)

model = GecBERTModel(
    vocab_path="./data/output_vocabulary",
    model_paths=["./pretrain/roberta_1_gector.th"],
    # model_paths = ["./pretrain/bert_0_gector.th", "./pretrain/roberta_1_gector.th", "./pretrain/xlnet_0_gector.th"],
    model_name="roberta",
    is_ensemble=False,
    iterations=3,
)

DEFAULT_CONFIG = {
    'iterations': 3,
    'min_probability': 0.5,
    'min_error_probability': 0.7,
    'case_sensitive': True,