Example #1
def main(args):
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    print("Loading resources...")
    # Load Tokenizer and other resources
    nlp = spacy.load("en")
    # Lancaster Stemmer
    stemmer = LancasterStemmer()
    # GB English word list (inc -ise and -ize)
    gb_spell = toolbox.loadDictionary(basename + "/resources/en_GB-large.txt")
    # Part of speech map file
    tag_map = toolbox.loadTagMap(basename + "/resources/en-ptb_map")
    # Setup output m2 file
    out_m2 = open(args.out, "w")

    # ExitStack lets us process an arbitrary number of files line by line simultaneously.
    # See https://stackoverflow.com/questions/24108769/how-to-read-and-process-multiple-files-simultaneously-in-python
    print("Processing files...")
    with ExitStack() as stack:
        in_files = [
            stack.enter_context(open(i)) for i in [args.orig] + args.cor
        ]
        # Process each line of all input files.
        for line_id, line in enumerate(zip(*in_files)):
            orig_sent = line[0].strip()
            cor_sents = line[1:]
            # If orig sent is empty, skip the line
            if not orig_sent: continue
            # Write the original sentence to the output m2 file.
            out_m2.write("S " + orig_sent + "\n")
            # Markup the original sentence with spacy (assume tokenized)
            proc_orig = toolbox.applySpacy(orig_sent.split(), nlp)
            # Loop through the corrected sentences
            for cor_id, cor_sent in enumerate(cor_sents):
                cor_sent = cor_sent.strip()
                # Identical sentences have no edits, so just write noop.
                if orig_sent == cor_sent:
                    out_m2.write(
                        "A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||" +
                        str(cor_id) + "\n")
                # Otherwise, do extra processing.
                else:
                    # Markup the corrected sentence with spacy (assume tokenized)
                    proc_cor = toolbox.applySpacy(cor_sent.strip().split(),
                                                  nlp)
                    # Auto align the parallel sentences and extract the edits.
                    auto_edits = align_text.getAutoAlignedEdits(
                        proc_orig, proc_cor, nlp, args)
                    # Loop through the edits.
                    for auto_edit in auto_edits:
                        # Give each edit an automatic error type.
                        cat = cat_rules.autoTypeEdit(auto_edit, proc_orig,
                                                     proc_cor, gb_spell,
                                                     tag_map, nlp, stemmer)
                        auto_edit[2] = cat
                        # Write the edit to the output m2 file.
                        out_m2.write(
                            toolbox.formatEdit(auto_edit, cor_id) + "\n")
            # Write a newline when we have processed all corrections for a given sentence.
            out_m2.write("\n")
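For reference, a hypothetical argparse setup consistent with how args is used in Example #1 (one original file, one or more corrected files, an output path, plus alignment options forwarded to align_text.getAutoAlignedEdits). The exact flag names and choices are assumptions, not taken from the original script.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Convert parallel original/corrected text files to an M2 file.")
    parser.add_argument("-orig", required=True, help="Path to the original text file.")
    parser.add_argument("-cor", required=True, nargs="+", help="Path(s) to one or more corrected text files.")
    parser.add_argument("-out", required=True, help="Path to the output M2 file.")
    # The next two options are assumptions about what args carries into getAutoAlignedEdits.
    parser.add_argument("-lev", action="store_true", help="Use Levenshtein alignment costs.")
    parser.add_argument("-merge", default="rules", choices=["rules", "all-split", "all-merge", "all-equal"],
                        help="Edit merging strategy.")
    main(parser.parse_args())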
Example #2
def _generate_m2(orig_sent, cor_sent):
    ignore_count = 0
    out_m2_str = ''
    # Process a single pre-aligned sentence pair.
    try:
        # Check sentence length:
        if len(orig_sent.strip().split()) < 3:
            raise Exception('Source sentence is too short.')
        if len(cor_sent.strip().split()) < 3:
            raise Exception('Target sentence is too short.')

        # Detokenize sents if they're pre-tokenized. Otherwise the result will be wrong.
        if args.is_tokenized_orig:
            orig_sent = detokenizer.detokenize(orig_sent.strip().split(),
                                               return_str=True)
        if args.is_tokenized_cor:
            cor_sent = detokenizer.detokenize(cor_sent.strip().split(),
                                              return_str=True)
        # Markup the parallel sentences with spacy (assume tokenized)
        proc_orig = toolbox.applySpacy(orig_sent.strip(), nlp)
        proc_cor = toolbox.applySpacy(cor_sent.strip(), nlp)
        # Write the original sentence to the output m2 file.
        out_m2_str += "S " + toolbox.formatProcSent(
            proc_orig, feature_delimiter=args.feature_delimiter) + "\n"
        out_m2_str += "T " + toolbox.formatProcSent(
            proc_cor, feature_delimiter=args.feature_delimiter) + "\n"
        # out_m2.write("S " + toolbox.formatProcSent(proc_orig, feature_delimiter=args.feature_delimiter) + "\n")
        # out_m2.write("T " + toolbox.formatProcSent(proc_cor, feature_delimiter=args.feature_delimiter) + "\n")
        # Identical sentences have no edits, so just write noop.
        if orig_sent.strip() == cor_sent.strip():
            out_m2_str += "A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0\n"
            # out_m2.write("A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0\n")
        # Otherwise, do extra processing.
        else:
            # Auto align the parallel sentences and extract the edits.
            auto_edits = align_text.getAutoAlignedEdits(
                proc_orig, proc_cor, nlp, args)
            # Loop through the edits.
            for auto_edit in auto_edits:
                # Give each edit an automatic error type.
                cat = cat_rules.autoTypeEdit(auto_edit, proc_orig, proc_cor,
                                             gb_spell, tag_map, nlp, stemmer)
                auto_edit[2] = cat
                # Write the edit to the output m2 file.
                out_m2_str += toolbox.formatEdit(auto_edit) + "\n"
                # out_m2.write(toolbox.formatEdit(auto_edit)+"\n")
        # Write a newline when there are no more edits.
        out_m2_str += "\n"
        # out_m2.write("\n")
    except KeyboardInterrupt:
        sys.exit(1)
    except Exception:
        ignore_count += 1
        print('\nIgnore example:')
        print('- Source: ', orig_sent)
        print('- Target: ', cor_sent)
        print()

    return out_m2_str, ignore_count
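A minimal driver sketch for _generate_m2, assuming the module-level resources it relies on (nlp, detokenizer, gb_spell, tag_map, stemmer, args, and the toolbox/align_text/cat_rules modules) have already been loaded as in the surrounding examples; the file paths are placeholders.

def process_files(orig_path, cor_path, out_path):
    total_ignored = 0
    with open(orig_path) as orig, open(cor_path) as cor, open(out_path, "w") as out_m2:
        # Feed each pre-aligned sentence pair through _generate_m2 and collect the output.
        for orig_sent, cor_sent in zip(orig, cor):
            m2_str, ignored = _generate_m2(orig_sent, cor_sent)
            total_ignored += ignored
            out_m2.write(m2_str)
    print("Ignored examples:", total_ignored)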
Example #3
def main(args):
    basename = os.path.dirname(os.path.realpath(__file__))
    print("Loading resources...")
    # Load Tokenizer and other resources
    nlp = spacy.load("en")

    # Setup output m2 file based on corrected file name.
    m2_out = open(args.out if args.out.endswith(".m2") else args.out + ".m2",
                  "w")

    print("Processing files...")
    with io.open(args.orig,
                 encoding='utf-8') as orig, io.open(args.cor,
                                                    encoding='utf-8') as cor:
        # Process each pre-aligned sentence pair.
        for orig_sent, cor_sent in zip(orig, cor):
            # Get the raw text.
            orig_sent = orig_sent.strip()
            cor_sent = cor_sent.strip()
            # Ignore empty sentences
            if not orig_sent and not cor_sent: continue
            # If args.tok, we also need to tokenise the text.
            if args.tok:
                orig_sent = nlp(orig_sent, tag=True, parse=True, entity=False)
                cor_sent = nlp(cor_sent, tag=True, parse=True, entity=False)
            # Otherwise, assume it is tokenized and then process.
            else:
                orig_sent = nlp.tokenizer.tokens_from_list(orig_sent.split())
                cor_sent = nlp.tokenizer.tokens_from_list(cor_sent.split())
                nlp.tagger(orig_sent)
                nlp.tagger(cor_sent)
                nlp.parser(orig_sent)
                nlp.parser(cor_sent)
            # Get a list of string toks for each.
            orig_toks = [tok.orth_ for tok in orig_sent]
            cor_toks = [tok.orth_ for tok in cor_sent]
            # Auto align the sentence and extract the edits.
            auto_edits = align_text.getAutoAlignedEdits(
                orig_toks, cor_toks, orig_sent, cor_sent, nlp, args.lev,
                args.merge)
            # Write orig_toks to output.
            m2_out.write("S " + " ".join(orig_toks) + "\n")
            # If there are no edits, write an explicit dummy edit.
            if not auto_edits:
                m2_out.write("A -1 -1|||noop||||||REQUIRED|||-NONE-|||0\n")
            # Write the auto edits to the file.
            for auto_edit in auto_edits:
                # Write the edit to output.
                m2_out.write(formatEdit(auto_edit) + "\n")
            # Write new line after each sentence.
            m2_out.write("\n")
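Example #3 targets the old spaCy 1.x API (tokens_from_list and separate tagger/parser calls). For comparison only, a sketch of the same "assume pre-tokenized" path using the spaCy 2+ API; this is an alternative, not the original code.

from spacy.tokens import Doc

def process_pretokenized(nlp, sent):
    # Build a Doc from pre-tokenized text, then run the loaded pipeline components over it.
    doc = Doc(nlp.vocab, words=sent.split())
    for name, pipe in nlp.pipeline:
        doc = pipe(doc)
    return doc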
Example #4
def main(args):
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    print("Loading resources...")
    # Load Tokenizer and other resources
    nlp = spacy.load("en")
    # Lancaster Stemmer
    stemmer = LancasterStemmer()
    # GB English word list (inc -ise and -ize)
    gb_spell = toolbox.loadDictionary(basename + "/resources/en_GB-large.txt")
    # Part of speech map file
    tag_map = toolbox.loadTagMap(basename + "/resources/en-ptb_map")
    # Setup output m2 file
    out_m2 = open(args.out, "w")

    print("Processing files...")
    # Open the original and corrected text files.
    with open(args.orig) as orig, open(args.cor) as cor:
        # Process each pre-aligned sentence pair.
        for orig_sent, cor_sent in zip(orig, cor):
            # Write the original sentence to the output m2 file.
            out_m2.write("S " + orig_sent)
            # Identical sentences have no edits, so just write noop.
            if orig_sent.strip() == cor_sent.strip():
                out_m2.write(
                    "A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0\n")
            # Otherwise, do extra processing.
            else:
                # Markup the parallel sentences with spacy (assume tokenized)
                proc_orig = toolbox.applySpacy(orig_sent.strip().split(), nlp)
                proc_cor = toolbox.applySpacy(cor_sent.strip().split(), nlp)
                # Auto align the parallel sentences and extract the edits.
                auto_edits = align_text.getAutoAlignedEdits(
                    proc_orig, proc_cor, nlp, args)
                # Loop through the edits.
                for auto_edit in auto_edits:
                    # Give each edit an automatic error type.
                    cat = cat_rules.autoTypeEdit(auto_edit, proc_orig,
                                                 proc_cor, gb_spell, tag_map,
                                                 nlp, stemmer)
                    auto_edit[2] = cat
                    # Write the edit to the output m2 file.
                    out_m2.write(toolbox.formatEdit(auto_edit) + "\n")
            # Write a newline when there are no more edits.
            out_m2.write("\n")
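For illustration, the kind of M2 block the loop above writes for a single sentence pair. The sentence, edit span, and error category are made up; the field layout mirrors the noop line already used in the code.

# S This are a sentence .
# A 1 2|||R:VERB:SVA|||is|||REQUIRED|||-NONE-|||0
#
# The "S" line carries the tokenized original sentence; each "A" line gives the
# token span, error type, correction, and annotator id, separated by "|||";
# a blank line terminates the block.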
Example #5
def main(args):
	# Get base working directory.
	basename = os.path.dirname(os.path.realpath(__file__))
	print("Loading resources...")
	# Load Tokenizer and other resources
	nlp = spacy.load("en")
	# Lancaster Stemmer
	stemmer = LancasterStemmer()
	# GB English word list (inc -ise and -ize)
	gb_spell = toolbox.loadDictionary(basename+"/resources/en_GB-large.txt")
	# Part of speech map file
	tag_map = toolbox.loadTagMap(basename+"/resources/en-ptb_map")	
	# Setup output m2 file
	out_m2 = open(args.out, "w")

	print("Processing files...")
	# Open the m2 file and split into sentence+edit chunks.
	m2_file = open(args.m2).read().strip().split("\n\n")
	for info in m2_file:
		# Get the original and corrected sentence + edits for each annotator.
		orig_sent, coder_dict = toolbox.processM2(info)
		# Write the orig_sent to the output m2 file.
		out_m2.write("S "+" ".join(orig_sent)+"\n")
		# Only process sentences with edits.
		if coder_dict:
			# Save marked up original sentence here, if required.
			proc_orig = ""
			# Loop through the annotators
			for coder, coder_info in sorted(coder_dict.items()):
				cor_sent = coder_info[0]
				gold_edits = coder_info[1]
				# If there is only 1 edit and it is noop, just write it.
				if gold_edits[0][2] == "noop":
					out_m2.write(toolbox.formatEdit(gold_edits[0], coder)+"\n")				
					continue
				# Markup the orig and cor sentence with spacy (assume tokenized)
				# Orig is marked up only once for the first coder that needs it.
				proc_orig = toolbox.applySpacy(orig_sent, nlp) if not proc_orig else proc_orig
				proc_cor = toolbox.applySpacy(cor_sent, nlp)
				# Loop through gold edits.
				for gold_edit in gold_edits:
					# Um and UNK edits (uncorrected errors) are always preserved.
					if gold_edit[2] in {"Um", "UNK"}:
						# Um should get changed to UNK unless using old categories.
						if gold_edit[2] == "Um" and not args.old_cats: gold_edit[2] = "UNK"
						out_m2.write(toolbox.formatEdit(gold_edit, coder)+"\n")				
					# Gold edits
					elif args.gold:
						# Minimise the edit; e.g. [has eaten -> was eaten] = [has -> was]
						if not args.max_edits:
							gold_edit = toolbox.minimiseEdit(gold_edit, proc_orig, proc_cor)
							# If minimised to nothing, the edit disappears.
							if not gold_edit: continue
						# Give the edit an automatic error type.
						if not args.old_cats:
							cat = cat_rules.autoTypeEdit(gold_edit, proc_orig, proc_cor, gb_spell, tag_map, nlp, stemmer)
							gold_edit[2] = cat
						# Write the edit to the output m2 file.
						out_m2.write(toolbox.formatEdit(gold_edit, coder)+"\n")
				# Auto edits
				if args.auto:
					# Auto align the parallel sentences and extract the edits.
					auto_edits = align_text.getAutoAlignedEdits(proc_orig, proc_cor, args)				
					# Loop through the edits.
					for auto_edit in auto_edits:
						# Give each edit an automatic error type.
						cat = cat_rules.autoTypeEdit(auto_edit, proc_orig, proc_cor, gb_spell, tag_map, nlp, stemmer)
						auto_edit[2] = cat
						# Write the edit to the output m2 file.
						out_m2.write(toolbox.formatEdit(auto_edit, coder)+"\n")
		# Write a newline when there are no more coders.
		out_m2.write("\n")
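Example #5 reads several flags off args (m2, out, gold/auto, max_edits, old_cats). A hypothetical parser consistent with that usage; the flag names and help strings are assumptions inferred from the code above.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Re-annotate an existing M2 file.")
    parser.add_argument("-m2", required=True, help="Input M2 file.")
    parser.add_argument("-out", required=True, help="Output M2 file.")
    # gold and auto choose whether gold edits are kept or edits are re-extracted automatically.
    type_group = parser.add_mutually_exclusive_group(required=True)
    type_group.add_argument("-gold", action="store_true", help="Keep the human gold edits.")
    type_group.add_argument("-auto", action="store_true", help="Re-extract edits automatically.")
    parser.add_argument("-max_edits", action="store_true", help="Do not minimise edit spans.")
    parser.add_argument("-old_cats", action="store_true", help="Keep the original error categories.")
    main(parser.parse_args())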
Example #6
def main(args):
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    print("Loading SpaCy...")
    # Load Tokenizer and other resources
    print("Note: disable unecessary pipelines: ner, textcats")
    nlp = spacy.load("en_core_web_lg", disable=['ner', 'textcat'])
    # Lancaster Stemmer
    stemmer = LancasterStemmer()
    # Moses Detokenizer
    detokenizer = MosesDetokenizer()
    # GB English word list (inc -ise and -ize)
    gb_spell = toolbox.loadDictionary(basename+"/resources/en_GB-large.txt")
    # Part of speech map file
    tag_map = toolbox.loadTagMap(basename+"/resources/en-ptb_map")
    # Setup output m2 file
    out_m2 = open(args.out, "w")
    # Compute missing examples count
    missing_count = 0
    print("Processing files...")
    # Open the original and corrected text files.
    with open(args.orig) as orig, open(args.cor) as cor:
        # Process each pre-aligned sentence pair.
        for orig_sent, cor_sent in tqdm(zip(orig, cor)):
            try:
                # Check sentence length:
                if len(orig_sent.strip().split()) < 3:
                    raise Exception('Source sentence is too short.')
                if len(cor_sent.strip().split()) < 3:
                    raise Exception('Target sentence is too short.')
                # Detokenize sents if they're pre-tokenized. Otherwise the result will be wrong.
                if args.is_tokenized_orig:
                    orig_sent = detokenizer.detokenize(orig_sent.strip().split(), return_str=True)
                if args.is_tokenized_cor:
                    cor_sent = detokenizer.detokenize(cor_sent.strip().split(), return_str=True)
                # Markup the parallel sentences with spacy (assume tokenized)
                proc_orig = toolbox.applySpacy(orig_sent.strip(), nlp)
                proc_cor = toolbox.applySpacy(cor_sent.strip(), nlp)
                # Write the original sentence to the output m2 file.
                out_m2.write("S " + toolbox.formatProcSent(proc_orig, feature_delimiter=args.feature_delimiter) + "\n")
                out_m2.write("T " + toolbox.formatProcSent(proc_cor, feature_delimiter=args.feature_delimiter) + "\n")
                # Identical sentences have no edits, so just write noop.
                if orig_sent.strip() == cor_sent.strip():
                    out_m2.write("A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0\n")
                # Otherwise, do extra processing.
                else:
                    # Auto align the parallel sentences and extract the edits.
                    auto_edits = align_text.getAutoAlignedEdits(proc_orig, proc_cor, nlp, args)
                    # Loop through the edits.
                    for auto_edit in auto_edits:
                        # Give each edit an automatic error type.
                        cat = cat_rules.autoTypeEdit(auto_edit, proc_orig, proc_cor, gb_spell, tag_map, nlp, stemmer)
                        auto_edit[2] = cat
                        # Write the edit to the output m2 file.
                        out_m2.write(toolbox.formatEdit(auto_edit)+"\n")
                # Write a newline when there are no more edits.
                out_m2.write("\n")
            except KeyboardInterrupt:
                sys.exit(1)
            except Exception:
                missing_count += 1
                print('\nMissing count:', missing_count)
                print('- Source: ', orig_sent)
                print('- Target: ', cor_sent)
                print()
                continue
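None of these examples show their imports; for context, a hypothetical import block consistent with the names used in Example #6. toolbox, align_text, and cat_rules are project-local modules, and MosesDetokenizer is assumed to come from sacremoses (NLTK's deprecated Moses wrapper exposes the same detokenize(tokens, return_str=True) call).

import os
import sys
import spacy
from tqdm import tqdm
from nltk.stem.lancaster import LancasterStemmer
from sacremoses import MosesDetokenizer  # assumption: could equally be nltk.tokenize.moses
import toolbox
import align_text
import cat_rules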
Example #7
def main(args):
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    print("Loading resources...")
    # Load Tokenizer and other resources
    nlp = spacy.load("en")
    # Lancaster Stemmer
    stemmer = LancasterStemmer()
    # GB English word list (inc -ise and -ize)
    gb_spell = toolbox.loadDictionary(basename + "/resources/en_GB-large.txt")
    # Part of speech map file
    tag_map = toolbox.loadTagMap(basename + "/resources/en-ptb_map")
    # Setup output m2 file
    out_m2 = open(args.out, "w")

    print("Processing files...")
    # Open the original and corrected text files.
    # Maps each source sentence to its edits, collected across multiple annotators of the same sentence.
    src_dict = defaultdict(list)
    src_line_present = False

    with open(args.orig) as orig, open(args.cor) as cor:
        # Process each pre-aligned sentence pair.
        for orig_sent, cor_sent in zip(orig, cor):
            # Write the original sentence to the output m2 file.
            #out_m2.write("S "+orig_sent)
            src_sent = "S " + orig_sent
            if src_sent not in src_dict:
                src_dict[src_sent] = []
                # Track whether this source sentence has been seen before.
                src_line_present = False
                src_line = src_sent
                # src_lp_count counts how many times the source line has already appeared;
                # it also serves as the annotator ID written to the m2 file.
                src_lp_count = 0
            else:
                src_line_present = True
                src_line = src_sent
                src_lp_count += 1
            # Identical sentences have no edits, so just write noop.
            if orig_sent.strip() == cor_sent.strip():
                #out_m2.write("A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0\n")
                src_dict[src_sent].append(
                    "A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0\n")
            # Otherwise, do extra processing.
            else:
                # Markup the parallel sentences with spacy (assume tokenized)
                proc_orig = toolbox.applySpacy(orig_sent.strip().split(), nlp)
                proc_cor = toolbox.applySpacy(cor_sent.strip().split(), nlp)
                # Auto align the parallel sentences and extract the edits.
                auto_edits = align_text.getAutoAlignedEdits(
                    proc_orig, proc_cor, nlp, args)
                # Loop through the edits.
                for auto_edit in auto_edits:
                    # Give each edit an automatic error type.
                    cat = cat_rules.autoTypeEdit(auto_edit, proc_orig,
                                                 proc_cor, gb_spell, tag_map,
                                                 nlp, stemmer)
                    auto_edit[2] = cat
                    # Write the edit to the output m2 file.
                    edit_to_be_written = toolbox.formatEdit(auto_edit)
                    if not src_line_present:
                        src_dict[src_line].append(edit_to_be_written + "\n")
                    else:
                        src_dict[src_line].append(edit_to_be_written[:-1] +
                                                  str(src_lp_count) + "\n")
    # Finally, write the source sentences (keys) and their edits (values) to the m2 file.
    for source_sent in src_dict:
        out_m2.write(source_sent)
        for edit in src_dict[source_sent]:
            out_m2.write(edit)
        out_m2.write('\n')
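For illustration, the kind of grouped output Example #7 produces when the same source sentence appears twice with different corrections; the sentence, categories, and corrections are made up. The second edit's annotator id comes from src_lp_count overwriting the trailing id produced by toolbox.formatEdit.

# S He go to school yesterday .
# A 1 2|||R:VERB:TENSE|||went|||REQUIRED|||-NONE-|||0
# A 1 2|||R:VERB:SVA|||goes|||REQUIRED|||-NONE-|||1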
Example #8
def main(args):
	# Get base working directory.
	basename = os.path.dirname(os.path.realpath(__file__))
	print("Loading resources...")
	# Punctuation normalisation dictionary
	norm_dict = {"’": "'",
				 "´": "'",
				 "‘": "'",
				 "′": "'",
				 "`": "'",
				 '“': '"',
				 '”': '"',
				 '˝': '"',
				 '¨': '"',
				 '„': '"',
				 '『': '"',
				 '』': '"',
				 '–': '-',
				 '—': '-',
				 '―': '-',
				 '¬': '-',
				 '、': ',',
				 ',': ',',
				 ':': ':',
				 ';': ';',
				 '?': '?',
				 '!': '!',
				 'ِ': ' ',
				 '\u200b': ' '}
	norm_dict = {ord(k): v for k, v in norm_dict.items()}
	# Load Tokenizer and other resources
	nlp = spacy.load("en")
	# Lancaster Stemmer
	stemmer = LancasterStemmer()
	# GB English word list (inc -ise and -ize)
	gb_spell = toolbox.loadDictionary(basename+"/resources/en_GB-large.txt")
	# Part of speech map file
	tag_map = toolbox.loadTagMap(basename+"/resources/en-ptb_map")
	# Setup output m2 file
	out_m2 = open(args.out, "w")

	print("Preprocessing files...")
	# Open the file
	with open(args.json_file) as data:
		# Process each line
		for line in data:
			# Load the JSON line
			line = json.loads(line)
			# Normalise certain punctuation in the text
			text = line["text"].translate(norm_dict)
			# Store the sentences and edits for all annotators here
			coder_dict = {}
			# Loop through the annotator ids and their edits
			for coder, edits in line["edits"]:
				# Add the coder to the coder_dict if needed
				if coder not in coder_dict: coder_dict[coder] = []
				# Split the essay into paragraphs and update and normalise the char edits
				para_info = getParas(text, edits, norm_dict)
				# Loop through the paragraphs and edits
				for orig_para, para_edits in para_info:
					# Remove unnecessary whitespace from para and update char edits
					orig_para, para_edits = cleanPara(orig_para, para_edits)
					if not orig_para: continue # Ignore empty paras
					# Annotate orig_para with spacy
					orig_para = nlp(orig_para)
					# Convert character edits to token edits
					para_edits = getTokenEdits(orig_para, para_edits, nlp)
					# Split the paragraph into sentences and update tok edits
					sents = getSents(orig_para, para_edits)
					# Save the sents in the coder_dict
					coder_dict[coder].extend(sents)
			# Get the sorted coder ids
			coder_ids = sorted(coder_dict.keys())
			# Loop through the sentences for the first coder
			for sent_id, sent in enumerate(coder_dict[0]):
				# Write the original sentence to the output M2 file
				out_m2.write("S "+" ".join(sent["orig"])+"\n")
				# Annotate the original sentence with spacy
				orig_sent = toolbox.applySpacy(sent["orig"], nlp)
				# Loop through the coders
				for coder in coder_ids:
					# Annotate the corrected sentence with spacy and get the gold edits
					cor_sent = toolbox.applySpacy(coder_dict[coder][sent_id]["cor"], nlp)
					gold_edits = coder_dict[coder][sent_id]["edits"]
					# Gold edits
					if args.gold:
						# Make sure edits are ordered in terms of start, then end offsets.
						gold_edits = sorted(gold_edits, key=itemgetter(0)) # Sort by start offset
						gold_edits = sorted(gold_edits, key=itemgetter(1)) # Sort by end offset
						min_edits = []
						# Loop through the gold edits.
						for gold_edit in gold_edits:
							# Minimise correction (not detection D) edits: e.g. [has eaten -> eaten] = [has -> ε]
							if gold_edit[2] == "C": gold_edit = toolbox.minimiseEdit(gold_edit, orig_sent, cor_sent)
							# Classify and save non-empty edits
							if gold_edit:
								cat = cat_rules.autoTypeEdit(gold_edit, orig_sent, cor_sent, gb_spell, tag_map, nlp, stemmer)
								gold_edit[2] = cat
								min_edits.append(gold_edit)
						# If there are no minimised edits, write an explicit empty edit
						if not min_edits:
							out_m2.write("A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||"+str(coder)+"\n")
						# Otherwise loop through the edits and write them to the output m2 file.
						for edit in min_edits: out_m2.write(toolbox.formatEdit(edit, coder)+"\n")
					# Auto edits
					elif args.auto:
						# Auto align the parallel sentences and extract the edits.
						auto_edits = align_text.getAutoAlignedEdits(orig_sent, cor_sent, nlp, args)
						# If there are no edits, write an explicit noop edit.
						if not auto_edits:
							out_m2.write("A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||"+str(coder)+"\n")
						# Loop through the edits.
						for auto_edit in auto_edits:
							# Give each edit an automatic error type.
							cat = cat_rules.autoTypeEdit(auto_edit, orig_sent, cor_sent, gb_spell, tag_map, nlp, stemmer)
							auto_edit[2] = cat
							# Write the edit to the output m2 file.
							out_m2.write(toolbox.formatEdit(auto_edit, coder)+"\n")
				# Write new line after each sentence when we reach last coder.
				out_m2.write("\n")
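A sketch of the JSON line format this preprocessing loop appears to consume: the "text" and "edits" keys and the [coder, edits] pairing are taken from the code, while the inner character-offset layout (start, end, correction) is an assumption.

# {"text": "My friend go to school every day ...",
#  "edits": [[0, [[10, 12, "goes"], ...]],
#            [1, [[10, 12, "went"], ...]]]}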
Example #9
def main(args):
	# Get base working directory.
	basename = os.path.dirname(os.path.realpath(__file__))
	print("Loading resources...")
	# Load Tokenizer and other resources
	nlp = spacy.load("en")
	# Lancaster Stemmer
	stemmer = LancasterStemmer()
	# GB English word list (inc -ise and -ize)
	gb_spell = toolbox.loadDictionary(basename+"/resources/en_GB-large.txt")
	# Part of speech map file
	tag_map = toolbox.loadTagMap(basename+"/resources/en-ptb_map")	
	# Setup output m2 file
	out_m2 = open(args.out, "w")

	print("Processing files...")
	# Open the m2 file and split into sentence+edit chunks.
	m2_file = open(args.m2).read().strip().split("\n\n")
	for info in m2_file:
		# Get the original and corrected sentence + edits for each annotator.
		orig_sent, coder_dict = toolbox.processM2(info)
		# Write the orig_sent to the output m2 file.
		out_m2.write("S "+" ".join(orig_sent)+"\n")
		# Markup the original sentence with spacy (assume tokenized)
		proc_orig = toolbox.applySpacy(orig_sent, nlp)
		# Loop through the annotators
		for coder, coder_info in sorted(coder_dict.items()):
			cor_sent = coder_info[0]
			gold_edits = coder_info[1]
			# Markup the corrected sentence with spacy (assume tokenized)
			proc_cor = toolbox.applySpacy(cor_sent, nlp)
			# Gold edits
			if args.gold:
				# Loop through the gold edits.
				for gold_edit in gold_edits:
					# Write noop edits to the output m2 file.
					if gold_edit[2] == "noop":
						out_m2.write(toolbox.formatEdit(gold_edit, coder)+"\n")
						continue
					# Minimise the edit; e.g. [has eaten -> was eaten] = [has -> was]
					if not args.max_edits:
						gold_edit = toolbox.minimiseEdit(gold_edit, proc_orig, proc_cor)
						# If minimised to nothing, the edit disappears.
						if not gold_edit: continue
					# Give the edit an automatic error type.
					if not args.old_cats:
						cat = cat_rules.autoTypeEdit(gold_edit, proc_orig, proc_cor, gb_spell, tag_map, nlp, stemmer)
						gold_edit[2] = cat
					# Write the edit to the output m2 file.
					out_m2.write(toolbox.formatEdit(gold_edit, coder)+"\n")
			# Auto edits
			elif args.auto:
				# Auto align the parallel sentences and extract the edits.
				auto_edits = align_text.getAutoAlignedEdits(proc_orig, proc_cor, nlp, args)				
				# If there are no edits, write an explicit noop edit.
				if not auto_edits:
					out_m2.write("A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||"+str(coder)+"\n")
				# Loop through the edits.
				for auto_edit in auto_edits:
					# Give each edit an automatic error type.
					cat = cat_rules.autoTypeEdit(auto_edit, proc_orig, proc_cor, gb_spell, tag_map, nlp, stemmer)
					auto_edit[2] = cat
					# Write the edit to the output m2 file.
					out_m2.write(toolbox.formatEdit(auto_edit, coder)+"\n")
		# Write a newline when there are no more coders.
		out_m2.write("\n")
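Inferred from the calls above rather than from the toolbox source: processM2 appears to return the tokenized original sentence plus a dict keyed by annotator id, where each value holds the corrected tokens and that coder's gold edits (with the error category at index 2 of each edit). A rough sketch of the shape, with the exact edit fields left as an assumption:

# orig_sent  = ["He", "go", "home", "."]
# coder_dict = {0: (["He", "goes", "home", "."],   # coder_info[0]: corrected tokens
#                   [[1, 2, "R:VERB:SVA", ...]])}  # coder_info[1]: gold edits (category at index 2)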
Example #10
def get_preprocess_text():
    if request.method == 'POST':
        in_str = request.form.get("sen_input")
        input_sen = in_str  #"If your genetic results indicate that you have gene changes associated with an increased risk of heart disease , it does not mean that you definitely will develop heart disease ."
        words = input_sen.split()
        totals = []
        candidate_words = 10
        delset = string.punctuation
        for each_word in words:
            if each_word in delset:
                totals.append([each_word])
                continue
            # Collect up to candidate_words suggestions whether or not the word passes the spell check.
            totals.append(spell_checker.suggest(each_word)[:candidate_words])
        print(totals)
        cur = []
        prev = [""]
        for i in range(len(totals)):
            for item in prev:
                for j in range(len(totals[i])):
                    cur.append((item + ' ' + totals[i][j]).strip())
            prev = cur
            cur = []

        outputs, ori_scores = model_predict(prev, models, generator,
                                            align_dict, max_positions, args,
                                            use_cuda, task, src_dict, tgt_dict)
        score_dict = dict()
        for ind, output in enumerate(outputs):
            s0 = ori_scores[ind]
            s1 = [
                float(item)
                for item in ed.get_score(input_sen, output).split()
            ]
            s2 = float(lm.get_score(input_sen, output))
            s3 = float(wp.get_score(input_sen, output))
            # Weighted combination of the component scores; s3 pairs with weights[5].
            final_score = (s0 * weights[0] + s1[0] * weights[1] +
                           s1[1] * weights[2] + s1[2] * weights[3] +
                           s2 * weights[4] + s3 * weights[5])
            score_dict[ind] = final_score
            print(s0, s1[0], s1[1], s1[2], s2, s3)
        sorted_indices = sorted(score_dict, key=score_dict.get, reverse=True)
        out_type = []
        for ind in sorted_indices:
            proc_orig = toolbox.applySpacy(input_sen.split(), nlp)
            output_type = '\n'
            cor_sent = outputs[ind]
            cor_sent = cor_sent.strip()
            # Identical sentences have no edits, so just write noop.
            if input_sen == cor_sent:
                output_type += "A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||" + "\n"
            # Otherwise, do extra processing.
            else:
                # Markup the corrected sentence with spacy (assume tokenized)
                proc_cor = toolbox.applySpacy(cor_sent.strip().split(), nlp)
                # Auto align the parallel sentences and extract the edits.
                auto_edits = align_text.getAutoAlignedEdits(
                    proc_orig, proc_cor, nlp, True, 'rules')
                # Loop through the edits.
                for auto_edit in auto_edits:
                    # Give each edit an automatic error type.
                    cat = cat_rules.autoTypeEdit(auto_edit, proc_orig,
                                                 proc_cor, gb_spell, tag_map,
                                                 nlp, stemmer)
                    auto_edit[2] = cat
                    # Write the edit to the output m2 file.
                    output_type += toolbox.formatEdit(auto_edit, 0) + "\n"
            out_type.append(output_type)
            print(outputs[ind])
        couplet_res = outputs[sorted_indices[0]] + out_type[0]
        sys.stdout.flush()
        return render_template('show.html',
                               sen_input=input_sen,
                               sen_res=couplet_res)
    else:
        return render_template('index.html')
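A hypothetical way to wire the handler in Example #10 into a Flask app; the app object and the "/" route are assumptions, while index.html and show.html are the templates already referenced in the function.

from flask import Flask

app = Flask(__name__)
app.add_url_rule("/", "get_preprocess_text", get_preprocess_text,
                 methods=["GET", "POST"])

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000)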