Example #1
def main(args):
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    print("Loading resources...")
    # Load Tokenizer and other resources
    nlp = spacy.load("en")
    # Lancaster Stemmer
    stemmer = LancasterStemmer()
    # GB English word list (inc -ise and -ize)
    gb_spell = toolbox.loadDictionary(basename + "/resources/en_GB-large.txt")
    # Part of speech map file
    tag_map = toolbox.loadTagMap(basename + "/resources/en-ptb_map")
    # Setup output m2 file
    out_m2 = open(args.out, "w")

    # ExitStack lets us process an arbitrary number of files line by line simultaneously.
    # See https://stackoverflow.com/questions/24108769/how-to-read-and-process-multiple-files-simultaneously-in-python
    print("Processing files...")
    with ExitStack() as stack:
        in_files = [
            stack.enter_context(open(i)) for i in [args.orig] + args.cor
        ]
        # Process each line of all input files.
        for line_id, line in enumerate(zip(*in_files)):
            orig_sent = line[0].strip()
            cor_sents = line[1:]
            # If orig sent is empty, skip the line
            if not orig_sent: continue
            # Write the original sentence to the output m2 file.
            out_m2.write("S " + orig_sent + "\n")
            # Markup the original sentence with spacy (assume tokenized)
            proc_orig = toolbox.applySpacy(orig_sent.split(), nlp)
            # Loop through the corrected sentences
            for cor_id, cor_sent in enumerate(cor_sents):
                cor_sent = cor_sent.strip()
                # Identical sentences have no edits, so just write noop.
                if orig_sent == cor_sent:
                    out_m2.write(
                        "A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||" +
                        str(cor_id) + "\n")
                # Otherwise, do extra processing.
                else:
                    # Markup the corrected sentence with spacy (assume tokenized)
                    proc_cor = toolbox.applySpacy(cor_sent.strip().split(),
                                                  nlp)
                    # Auto align the parallel sentences and extract the edits.
                    auto_edits = align_text.getAutoAlignedEdits(
                        proc_orig, proc_cor, nlp, args)
                    # Loop through the edits.
                    for auto_edit in auto_edits:
                        # Give each edit an automatic error type.
                        cat = cat_rules.autoTypeEdit(auto_edit, proc_orig,
                                                     proc_cor, gb_spell,
                                                     tag_map, nlp, stemmer)
                        auto_edit[2] = cat
                        # Write the edit to the output m2 file.
                        out_m2.write(
                            toolbox.formatEdit(auto_edit, cor_id) + "\n")
            # Write a newline when we have processed all corrections for a given sentence.
            out_m2.write("\n")
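The ExitStack pattern referenced in the comment above is what lets Example #1 read one original file plus any number of correction files line by line in lockstep. A minimal standalone sketch of the same technique, with hypothetical file names in place of args.orig and args.cor:

from contextlib import ExitStack

# Hypothetical input files; the script above takes these from args.orig and args.cor.
paths = ["orig.txt", "cor1.txt", "cor2.txt"]

with ExitStack() as stack:
    files = [stack.enter_context(open(p)) for p in paths]
    # zip(*files) yields one tuple per line: (orig_line, cor_1_line, cor_2_line, ...)
    for line_id, lines in enumerate(zip(*files)):
        orig = lines[0].strip()
        cors = [l.strip() for l in lines[1:]]
        print(line_id, orig, cors)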
Example #2
def main(args):
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    print("Loading resources...")
    # Load Tokenizer and other resources
    nlp = spacy.load("en")
    # Lancaster Stemmer
    stemmer = LancasterStemmer()
    # GB English word list (inc -ise and -ize)
    gb_spell = toolbox.loadDictionary(basename + "/resources/en_GB-large.txt")
    # Part of speech map file
    tag_map = toolbox.loadTagMap(basename + "/resources/en-ptb_map")
    # Setup output m2 file
    out_m2 = open(args.out, "w")

    print("Processing files...")
    # Open the original and corrected text files.
    with open(args.orig) as orig, open(args.cor) as cor:
        # Process each pre-aligned sentence pair.
        for orig_sent, cor_sent in zip(orig, cor):
            # Write the original sentence to the output m2 file.
            out_m2.write("S " + orig_sent)
            # Identical sentences have no edits, so just write noop.
            if orig_sent.strip() == cor_sent.strip():
                out_m2.write(
                    "A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0\n")
            # Otherwise, do extra processing.
            else:
                # Markup the parallel sentences with spacy (assume tokenized)
                proc_orig = toolbox.applySpacy(orig_sent.strip().split(), nlp)
                proc_cor = toolbox.applySpacy(cor_sent.strip().split(), nlp)
                # Auto align the parallel sentences and extract the edits.
                auto_edits = align_text.getAutoAlignedEdits(
                    proc_orig, proc_cor, nlp, args)
                # Loop through the edits.
                for auto_edit in auto_edits:
                    # Give each edit an automatic error type.
                    cat = cat_rules.autoTypeEdit(auto_edit, proc_orig,
                                                 proc_cor, gb_spell, tag_map,
                                                 nlp, stemmer)
                    auto_edit[2] = cat
                    # Write the edit to the output m2 file.
                    out_m2.write(toolbox.formatEdit(auto_edit) + "\n")
            # Write a newline when there are no more edits.
            out_m2.write("\n")
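Each of these scripts receives its settings through args. A minimal argparse sketch for the single-file case in Example #2, assuming flag names based only on the attributes the code reads (orig, cor, out); the real scripts may define further options, such as the alignment parameters consumed by align_text.getAutoAlignedEdits:

import argparse

# Hypothetical CLI wrapper; the flag names simply mirror the attributes accessed above.
parser = argparse.ArgumentParser(description="Convert a parallel orig/cor file pair to M2.")
parser.add_argument("-orig", required=True, help="path to the original (source) sentences")
parser.add_argument("-cor", required=True, help="path to the corrected (target) sentences")
parser.add_argument("-out", required=True, help="path to the output M2 file")

if __name__ == "__main__":
    main(parser.parse_args())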
Example #3
File: m2_to_m2.py  Project: xiyang85/errant
def main(args):
	# Get base working directory.
	basename = os.path.dirname(os.path.realpath(__file__))
	print("Loading resources...")
	# Load Tokenizer and other resources
	nlp = spacy.load("en")
	# Lancaster Stemmer
	stemmer = LancasterStemmer()
	# GB English word list (inc -ise and -ize)
	gb_spell = toolbox.loadDictionary(basename+"/resources/en_GB-large.txt")
	# Part of speech map file
	tag_map = toolbox.loadTagMap(basename+"/resources/en-ptb_map")	
	# Setup output m2 file
	out_m2 = open(args.out, "w")

	print("Processing files...")
	# Open the m2 file and split into sentence+edit chunks.
	m2_file = open(args.m2).read().strip().split("\n\n")
	for info in m2_file:
		# Get the original and corrected sentence + edits for each annotator.
		orig_sent, coder_dict = toolbox.processM2(info)
		# Write the orig_sent to the output m2 file.
		out_m2.write("S "+" ".join(orig_sent)+"\n")
		# Only process sentences with edits.
		if coder_dict:
			# Save marked up original sentence here, if required.
			proc_orig = ""
			# Loop through the annotators
			for coder, coder_info in sorted(coder_dict.items()):
				cor_sent = coder_info[0]
				gold_edits = coder_info[1]
				# If there is only 1 edit and it is noop, just write it.
				if gold_edits[0][2] == "noop":
					out_m2.write(toolbox.formatEdit(gold_edits[0], coder)+"\n")				
					continue
				# Markup the orig and cor sentence with spacy (assume tokenized)
				# Orig is marked up only once for the first coder that needs it.
				proc_orig = toolbox.applySpacy(orig_sent, nlp) if not proc_orig else proc_orig
				proc_cor = toolbox.applySpacy(cor_sent, nlp)
				# Loop through gold edits.
				for gold_edit in gold_edits:
					# Um and UNK edits (uncorrected errors) are always preserved.
					if gold_edit[2] in {"Um", "UNK"}:
						# Um should get changed to UNK unless using old categories.
						if gold_edit[2] == "Um" and not args.old_cats: gold_edit[2] = "UNK"
						out_m2.write(toolbox.formatEdit(gold_edit, coder)+"\n")				
					# Gold edits
					elif args.gold:
						# Minimise the edit; e.g. [has eaten -> was eaten] = [has -> was]
						if not args.max_edits:
							gold_edit = toolbox.minimiseEdit(gold_edit, proc_orig, proc_cor)
							# If minimised to nothing, the edit disappears.
							if not gold_edit: continue
						# Give the edit an automatic error type.
						if not args.old_cats:
							cat = cat_rules.autoTypeEdit(gold_edit, proc_orig, proc_cor, gb_spell, tag_map, nlp, stemmer)
							gold_edit[2] = cat
						# Write the edit to the output m2 file.
						out_m2.write(toolbox.formatEdit(gold_edit, coder)+"\n")
				# Auto edits
				if args.auto:
					# Auto align the parallel sentences and extract the edits.
					auto_edits = align_text.getAutoAlignedEdits(proc_orig, proc_cor, args)				
					# Loop through the edits.
					for auto_edit in auto_edits:
						# Give each edit an automatic error type.
						cat = cat_rules.autoTypeEdit(auto_edit, proc_orig, proc_cor, gb_spell, tag_map, nlp, stemmer)
						auto_edit[2] = cat
						# Write the edit to the output m2 file.
						out_m2.write(toolbox.formatEdit(auto_edit, coder)+"\n")
		# Write a newline when there are no more coders.
		out_m2.write("\n")
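toolbox.processM2 above consumes one sentence+edit chunk at a time; the chunks come from splitting the M2 file on blank lines, so each one is an S line followed by the A lines of every annotator, using the same field layout as the noop lines written elsewhere in these examples. A hypothetical chunk (the sentence and edit are invented for illustration):

# One element of open(args.m2).read().strip().split("\n\n"), i.e. a single M2 chunk.
chunk = (
    "S This are a sentence .\n"
    "A 1 2|||R:VERB:SVA|||is|||REQUIRED|||-NONE-|||0\n"
    "A 1 2|||R:VERB:SVA|||is|||REQUIRED|||-NONE-|||1"
)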
Example #4
def main(args):
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    print("Loading SpaCy...")
    # Load Tokenizer and other resources
    print("Note: disable unecessary pipelines: ner, textcats")
    nlp = spacy.load("en_core_web_lg", disable=['ner', 'textcat'])
    # Lancaster Stemmer
    stemmer = LancasterStemmer()
    # Moses Detokenizer
    detokenizer = MosesDetokenizer()
    # GB English word list (inc -ise and -ize)
    gb_spell = toolbox.loadDictionary(basename+"/resources/en_GB-large.txt")
    # Part of speech map file
    tag_map = toolbox.loadTagMap(basename+"/resources/en-ptb_map")
    # Setup output m2 file
    out_m2 = open(args.out, "w")
    # Counter for sentence pairs that were skipped due to errors.
    missing_count = 0
    print("Processing files...")
    # Open the original and corrected text files.
    with open(args.orig) as orig, open(args.cor) as cor:
        # Process each pre-aligned sentence pair.
        for orig_sent, cor_sent in tqdm(zip(orig, cor)):
            try:
                # Check sentence length:
                if len(orig_sent.strip().split()) < 3:
                    raise Exception('Source sentence is too short.')
                if len(cor_sent.strip().split()) < 3:
                    raise Exception('Target sentence is too short.')
                # Detokenize sents if they're pre-tokenized. Otherwise the result will be wrong.
                if args.is_tokenized_orig:
                    orig_sent = detokenizer.detokenize(orig_sent.strip().split(), return_str=True)
                if args.is_tokenized_cor:
                    cor_sent = detokenizer.detokenize(cor_sent.strip().split(), return_str=True)
                # Markup the parallel sentences with spacy (assume tokenized)
                proc_orig = toolbox.applySpacy(orig_sent.strip(), nlp)
                proc_cor = toolbox.applySpacy(cor_sent.strip(), nlp)
                # Write the original sentence to the output m2 file.
                out_m2.write("S " + toolbox.formatProcSent(proc_orig, feature_delimiter=args.feature_delimiter) + "\n")
                out_m2.write("T " + toolbox.formatProcSent(proc_cor, feature_delimiter=args.feature_delimiter) + "\n")
                # Identical sentences have no edits, so just write noop.
                if orig_sent.strip() == cor_sent.strip():
                    out_m2.write("A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0\n")
                # Otherwise, do extra processing.
                else:
                    # Auto align the parallel sentences and extract the edits.
                    auto_edits = align_text.getAutoAlignedEdits(proc_orig, proc_cor, nlp, args)
                    # Loop through the edits.
                    for auto_edit in auto_edits:
                        # Give each edit an automatic error type.
                        cat = cat_rules.autoTypeEdit(auto_edit, proc_orig, proc_cor, gb_spell, tag_map, nlp, stemmer)
                        auto_edit[2] = cat
                        # Write the edit to the output m2 file.
                        out_m2.write(toolbox.formatEdit(auto_edit)+"\n")
                # Write a newline when there are no more edits.
                out_m2.write("\n")
            except KeyboardInterrupt:
                sys.exit(1)
            except:
                missing_count += 1
                print('\nMissing count:', missing_count)
                print('- Source: ', orig_sent)
                print('- Target: ', cor_sent)
                print()
                continue
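The detokenization step in Example #4 exists because this variant feeds raw text to spaCy rather than a pre-split token list, so pre-tokenized input has to be glued back together first. In isolation, and assuming the sacremoses implementation since the import is not shown in the snippet, the step looks like this:

from sacremoses import MosesDetokenizer  # assumed source of the MosesDetokenizer used above

detok = MosesDetokenizer()
tokens = "Hello , world ! This is a pre-tokenized sentence .".split()
# Re-attaches punctuation to the preceding words, e.g. "Hello, world!"
raw = detok.detokenize(tokens, return_str=True)
print(raw)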
Example #5
from tqdm import tqdm
import sys
from joblib import Parallel, delayed

# Get base working directory.
basename = os.path.dirname(os.path.realpath(__file__))
print("Loading SpaCy...")
# Load Tokenizer and other resources
print("Note: disable unecessary pipelines: ner, textcats")
nlp = spacy.load("en_core_web_lg", disable=['ner', 'textcat'])
# Lancaster Stemmer
stemmer = LancasterStemmer()
# Moses Detokenizer
detokenizer = MosesDetokenizer()
# GB English word list (inc -ise and -ize)
gb_spell = toolbox.loadDictionary(basename + "/resources/en_GB-large.txt")
# Part of speech map file
tag_map = toolbox.loadTagMap(basename + "/resources/en-ptb_map")


def _generate_m2(orig_sent, cor_sent):
    ignore_count = 0
    out_m2_str = ''
    # Process each pre-aligned sentence pair.
    try:
        # Check sentence length:
        if len(orig_sent.strip().split()) < 3:
            raise Exception('Source sentence is too short.')
        if len(cor_sent.strip().split()) < 3:
            raise Exception('Target sentence is too short.')
Example #6
def main(args):
    # Get base working directory.
    basename = os.path.dirname(os.path.realpath(__file__))
    print("Loading resources...")
    # Load Tokenizer and other resources
    nlp = spacy.load("en")
    # Lancaster Stemmer
    stemmer = LancasterStemmer()
    # GB English word list (inc -ise and -ize)
    gb_spell = toolbox.loadDictionary(basename + "/resources/en_GB-large.txt")
    # Part of speech map file
    tag_map = toolbox.loadTagMap(basename + "/resources/en-ptb_map")
    # Setup output m2 file
    out_m2 = open(args.out, "w")

    print("Processing files...")
    # src_dict stores source sentences as keys and their edits as values,
    # including edits from multiple annotators for the same source sentence.
    src_dict = defaultdict(list)
    src_line_present = False

    # Open the original and corrected text files.
    with open(args.orig) as orig, open(args.cor) as cor:
        # Process each pre-aligned sentence pair.
        for orig_sent, cor_sent in zip(orig, cor):
            # Write the original sentence to the output m2 file.
            #out_m2.write("S "+orig_sent)
            src_sent = "S " + orig_sent
            if src_sent not in src_dict.keys():
                src_dict[src_sent] = []
                src_line_present = False  # Boolean variable to check if source sentence already present in dictionary
                src_line = src_sent
                src_lp_count = 0  # Variable to store how many times source line is already present.
                #src_lp_count also keeps track of annotator IDs to be written to m2 file.
            else:
                src_line_present = True
                src_line = src_sent
                src_lp_count += 1
            # Identical sentences have no edits, so just write noop.
            if orig_sent.strip() == cor_sent.strip():
                #out_m2.write("A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0\n")
                src_dict[src_sent].append(
                    "A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0\n")
            # Otherwise, do extra processing.
            else:
                # Markup the parallel sentences with spacy (assume tokenized)
                proc_orig = toolbox.applySpacy(orig_sent.strip().split(), nlp)
                proc_cor = toolbox.applySpacy(cor_sent.strip().split(), nlp)
                # Auto align the parallel sentences and extract the edits.
                auto_edits = align_text.getAutoAlignedEdits(
                    proc_orig, proc_cor, nlp, args)
                # Loop through the edits.
                for auto_edit in auto_edits:
                    # Give each edit an automatic error type.
                    cat = cat_rules.autoTypeEdit(auto_edit, proc_orig,
                                                 proc_cor, gb_spell, tag_map,
                                                 nlp, stemmer)
                    auto_edit[2] = cat
                    # Write the edit to the output m2 file.
                    edit_to_be_written = toolbox.formatEdit(auto_edit)
                    if not src_line_present:
                        src_dict[src_line].append(edit_to_be_written + "\n")
                    else:
                        src_dict[src_line].append(edit_to_be_written[:-1] +
                                                  str(src_lp_count) + "\n")
    # Finally write the source sentences (keys) and their edits (values) to the m2 file.
    for source_sent, edits in src_dict.items():
        out_m2.write(source_sent)
        for edit in edits:
            out_m2.write(edit)
        out_m2.write('\n')
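The edit_to_be_written[:-1] + str(src_lp_count) trick in Example #6 relies on toolbox.formatEdit ending every A line with the default annotator id 0, which is then overwritten with the running count for repeated source sentences. A small sketch of that string manipulation (the edit line is invented, and note the slice only works for single-digit annotator ids):

# Hypothetical formatted edit line ending in the default annotator id "0".
edit = "A 2 3|||R:VERB|||went|||REQUIRED|||-NONE-|||0"
src_lp_count = 1
print(edit[:-1] + str(src_lp_count))
# -> A 2 3|||R:VERB|||went|||REQUIRED|||-NONE-|||1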
Example #7
def main(args):
	# Get base working directory.
	basename = os.path.dirname(os.path.realpath(__file__))
	print("Loading resources...")
	# Punctuation normalisation dictionary
	norm_dict = {"’": "'",
				 "´": "'",
				 "‘": "'",
				 "′": "'",
				 "`": "'",
				 '“': '"',
				 '”': '"',
				 '˝': '"',
				 '¨': '"',
				 '„': '"',
				 '『': '"',
				 '』': '"',
				 '–': '-',
				 '—': '-',
				 '―': '-',
				 '¬': '-',
				 '、': ',',
				 ',': ',',
				 ':': ':',
				 ';': ';',
				 '?': '?',
				 '!': '!',
				 'ِ': ' ',
				 '\u200b': ' '}
	norm_dict = {ord(k): v for k, v in norm_dict.items()}
	# Load Tokenizer and other resources
	nlp = spacy.load("en")
	# Lancaster Stemmer
	stemmer = LancasterStemmer()
	# GB English word list (inc -ise and -ize)
	gb_spell = toolbox.loadDictionary(basename+"/resources/en_GB-large.txt")
	# Part of speech map file
	tag_map = toolbox.loadTagMap(basename+"/resources/en-ptb_map")
	# Setup output m2 file
	out_m2 = open(args.out, "w")

	print("Preprocessing files...")
	# Open the file
	with open(args.json_file) as data:
		# Process each line
		for line in data:
			# Load the JSON line
			line = json.loads(line)
			# Normalise certain punctuation in the text
			text = line["text"].translate(norm_dict)
			# Store the sentences and edits for all annotators here
			coder_dict = {}
			# Loop through the annotator ids and their edits
			for coder, edits in line["edits"]:
				# Add the coder to the coder_dict if needed
				if coder not in coder_dict: coder_dict[coder] = []
				# Split the essay into paragraphs and update and normalise the char edits
				para_info = getParas(text, edits, norm_dict)
				# Loop through the paragraphs and edits
				for orig_para, para_edits in para_info:
					# Remove unnecessary whitespace from para and update char edits
					orig_para, para_edits = cleanPara(orig_para, para_edits)
					if not orig_para: continue # Ignore empty paras
					# Annotate orig_para with spacy
					orig_para = nlp(orig_para)
					# Convert character edits to token edits
					para_edits = getTokenEdits(orig_para, para_edits, nlp)
					# Split the paragraph into sentences and update tok edits
					sents = getSents(orig_para, para_edits)
					# Save the sents in the coder_dict
					coder_dict[coder].extend(sents)
			# Get the sorted coder ids
			coder_ids = sorted(coder_dict.keys())
			# Loop through the sentences for the first coder
			for sent_id, sent in enumerate(coder_dict[0]):
				# Write the original sentence to the output M2 file
				out_m2.write("S "+" ".join(sent["orig"])+"\n")
				# Annotate the original sentence with spacy
				orig_sent = toolbox.applySpacy(sent["orig"], nlp)
				# Loop through the coders
				for coder in coder_ids:
					# Annotate the corrected sentence with spacy and get the gold edits
					cor_sent = toolbox.applySpacy(coder_dict[coder][sent_id]["cor"], nlp)
					gold_edits = coder_dict[coder][sent_id]["edits"]
					# Gold edits
					if args.gold:
						# Make sure edits are ordered in terms of start, then end offsets.
						gold_edits = sorted(gold_edits, key=itemgetter(0)) # Sort by start offset
						gold_edits = sorted(gold_edits, key=itemgetter(1)) # Sort by end offset
						min_edits = []
						# Loop through the gold edits.
						for gold_edit in gold_edits:
							# Minimise correction (not detection D) edits: e.g. [has eaten -> eaten] = [has -> ε]
							if gold_edit[2] == "C": gold_edit = toolbox.minimiseEdit(gold_edit, orig_sent, cor_sent)
							# Classify and save non-empty edits
							if gold_edit:
								cat = cat_rules.autoTypeEdit(gold_edit, orig_sent, cor_sent, gb_spell, tag_map, nlp, stemmer)
								gold_edit[2] = cat
								min_edits.append(gold_edit)
						# If there are no minimised edits, write an explicit empty edit
						if not min_edits:
							out_m2.write("A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||"+str(coder)+"\n")
						# Otherwise loop through the edits and write them to the output m2 file.
						for edit in min_edits: out_m2.write(toolbox.formatEdit(edit, coder)+"\n")
					# Auto edits
					elif args.auto:
						# Auto align the parallel sentences and extract the edits.
						auto_edits = align_text.getAutoAlignedEdits(orig_sent, cor_sent, nlp, args)
						# If there are no edits, write an explicit noop edit.
						if not auto_edits:
							out_m2.write("A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||"+str(coder)+"\n")
						# Loop through the edits.
						for auto_edit in auto_edits:
							# Give each edit an automatic error type.
							cat = cat_rules.autoTypeEdit(auto_edit, orig_sent, cor_sent, gb_spell, tag_map, nlp, stemmer)
							auto_edit[2] = cat
							# Write the edit to the output m2 file.
							out_m2.write(toolbox.formatEdit(auto_edit, coder)+"\n")
				# Write new line after each sentence when we reach last coder.
				out_m2.write("\n")
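The punctuation normalisation in Example #7 works because str.translate accepts a dict keyed by Unicode code points, which is exactly what the ord(k) comprehension builds. A minimal standalone sketch:

# Map a few typographic characters to ASCII equivalents, then key the table by code point.
norm_dict = {"’": "'", "“": '"', "”": '"', "—": "-"}
table = {ord(k): v for k, v in norm_dict.items()}
print("“It’s fine” — they said.".translate(table))
# -> "It's fine" - they said.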
Example #8
def main(args):
	# Get base working directory.
	basename = os.path.dirname(os.path.realpath(__file__))
	print("Loading resources...")
	# Load Tokenizer and other resources
	nlp = spacy.load("en")
	# Lancaster Stemmer
	stemmer = LancasterStemmer()
	# GB English word list (inc -ise and -ize)
	gb_spell = toolbox.loadDictionary(basename+"/resources/en_GB-large.txt")
	# Part of speech map file
	tag_map = toolbox.loadTagMap(basename+"/resources/en-ptb_map")	
	# Setup output m2 file
	out_m2 = open(args.out, "w")

	print("Processing files...")
	# Open the m2 file and split into sentence+edit chunks.
	m2_file = open(args.m2).read().strip().split("\n\n")
	for info in m2_file:
		# Get the original and corrected sentence + edits for each annotator.
		orig_sent, coder_dict = toolbox.processM2(info)
		# Write the orig_sent to the output m2 file.
		out_m2.write("S "+" ".join(orig_sent)+"\n")
		# Markup the original sentence with spacy (assume tokenized)
		proc_orig = toolbox.applySpacy(orig_sent, nlp)
		# Loop through the annotators
		for coder, coder_info in sorted(coder_dict.items()):
			cor_sent = coder_info[0]
			gold_edits = coder_info[1]
			# Markup the corrected sentence with spacy (assume tokenized)
			proc_cor = toolbox.applySpacy(cor_sent, nlp)
			# Gold edits
			if args.gold:
				# Loop through the gold edits.
				for gold_edit in gold_edits:
					# Write noop edits to the output m2 file.
					if gold_edit[2] == "noop":
						out_m2.write(toolbox.formatEdit(gold_edit, coder)+"\n")
						continue
					# Minimise the edit; e.g. [has eaten -> was eaten] = [has -> was]
					if not args.max_edits:
						gold_edit = toolbox.minimiseEdit(gold_edit, proc_orig, proc_cor)
						# If minimised to nothing, the edit disappears.
						if not gold_edit: continue
					# Give the edit an automatic error type.
					if not args.old_cats:
						cat = cat_rules.autoTypeEdit(gold_edit, proc_orig, proc_cor, gb_spell, tag_map, nlp, stemmer)
						gold_edit[2] = cat
					# Write the edit to the output m2 file.
					out_m2.write(toolbox.formatEdit(gold_edit, coder)+"\n")
			# Auto edits
			elif args.auto:
				# Auto align the parallel sentences and extract the edits.
				auto_edits = align_text.getAutoAlignedEdits(proc_orig, proc_cor, nlp, args)				
				# If there are no edits, write an explicit noop edit.
				if not auto_edits:
					out_m2.write("A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||"+str(coder)+"\n")
				# Loop through the edits.
				for auto_edit in auto_edits:
					# Give each edit an automatic error type.
					cat = cat_rules.autoTypeEdit(auto_edit, proc_orig, proc_cor, gb_spell, tag_map, nlp, stemmer)
					auto_edit[2] = cat
					# Write the edit to the output m2 file.
					out_m2.write(toolbox.formatEdit(auto_edit, coder)+"\n")
		# Write a newline when there are no more coders.
		out_m2.write("\n")
Example #9
lm = LM('LM0',
        '/data/wangzhe/SematicSeg/mlconvgec2018/models/lm/94Bcclm.trie',
        normalize=False)
wp = WordPenalty(name='WordPenalty0')
weights = [0.94064, -0.0208803, -0.00450021, 0.015532, 0.00618153, -0.0122658]
models, generator, align_dict, max_positions, args, use_cuda, task, src_dict, tgt_dict = load(
    model_names=model_names, use_cpu=False)

basename = os.path.dirname(os.path.realpath(__file__))

# Load Tokenizer and other resources
nlp = spacy.load("en")
# Lancaster Stemmer
stemmer = LancasterStemmer()
# GB English word list (inc -ise and -ize)
gb_spell = toolbox.loadDictionary(
    '/data/wangzhe/SematicSeg/mlconvgec2018/models/data_bin/dict.load.txt')
# Part of speech map file
tag_map = toolbox.loadTagMap(basename + "/resources/en-ptb_map")
print("Loading resources...")


def cross_make_response(json_info):
    response = make_response(json_info)
    response.headers['Content-Type'] = 'text/html'
    response.headers['Access-Control-Allow-Origin'] = '*'
    response.headers['Access-Control-Allow-Methods'] = 'OPTIONS,HEAD,POST,GET'
    response.headers[
        'Access-Control-Allow-Headers'] = 'x-requested-with,content-type'
    return response
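cross_make_response in Example #9 wraps Flask's make_response and attaches permissive CORS headers. A hypothetical route using it (the endpoint name and JSON payload are assumptions, not part of the original project):

from flask import Flask, jsonify, request

app = Flask(__name__)

@app.route("/correct", methods=["POST"])  # hypothetical endpoint
def correct():
    payload = request.get_json(force=True, silent=True) or {}
    text = payload.get("text", "")
    # ... run the loaded models / GEC pipeline on `text` here ...
    # cross_make_response (defined above) adds the CORS headers to the reply.
    return cross_make_response(jsonify({"input": text}))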