def load_annotation(gold_file):
    """Load source sentences and gold edits from an annotation file.

    The file is opened with ``smart_open`` and split into blocks by
    ``paragraphs``.  Inside each block, lines starting with ``'S '`` carry
    source sentences and lines starting with ``'A '`` carry annotations of
    the form ``start end|||type|||corr1||corr2...``; lines starting with
    ``'I '`` are ignored.  A correction spelled ``-NONE-`` becomes ``''``.

    Args:
        gold_file: path (or whatever ``smart_open`` accepts) of the
            annotation file.

    Returns:
        A tuple ``(source_sentences, gold_edits)`` of two parallel lists.
        ``gold_edits[i]`` is a list of
        ``(start_offset, end_offset, original, corrections)`` tuples whose
        offsets are both at or before the cumulative token count reached
        at the end of ``source_sentences[i]`` within its block.

    Raises:
        AssertionError: if a block has no ``'S '`` line, or contains a line
            (after the first) that is not an ``'S '``, ``'I '`` or ``'A '``
            line.
    """
    fgold = smart_open(gold_file, 'r')
    try:
        puffer = fgold.read()
    finally:
        # Close the handle even when reading fails.
        fgold.close()
    # Decode only raw bytes; already-decoded text is used as is, so the
    # loader works whether smart_open yields bytes or text.
    if isinstance(puffer, bytes):
        puffer = puffer.decode('utf8')

    source_sentences = []
    gold_edits = []
    for item in paragraphs(puffer.splitlines(True)):
        lines = item.splitlines(False)
        sentence = [line[2:].strip() for line in lines if line.startswith('S ')]
        assert sentence != [], "annotation block without an 'S ' line"
        # Token view of the whole block, computed once per block instead of
        # once per annotation line.
        tokens = ' '.join(sentence).split()

        annotation = []
        for line in lines[1:]:
            if line.startswith(('I ', 'S ')):
                continue
            assert line.startswith('A '), "unexpected line in annotation block"
            fields = line[2:].split('|||')
            span = fields[0].split()
            start_offset = int(span[0])
            end_offset = int(span[1])
            corrections = [
                c.strip() if c != '-NONE-' else ''
                for c in fields[2].split('||')
            ]
            # NOTE: start and end are *token* offsets
            original = ' '.join(tokens[start_offset:end_offset])
            annotation.append((start_offset, end_offset, original, corrections))

        tok_offset = 0
        for this_sentence in sentence:
            tok_offset += len(this_sentence.split())
            source_sentences.append(this_sentence)
            gold_edits.append([
                edit for edit in annotation
                if edit[0] <= tok_offset and edit[1] <= tok_offset
            ])
    return (source_sentences, gold_edits)
예제 #2
0
def load_annotation(gold_file):
    """Load source sentences and per-annotator gold edits from a file.

    The file is opened with ``smart_open`` and split into blocks by
    ``paragraphs``.  Inside each block, lines starting with ``'S '`` carry
    source sentences and lines starting with ``'A '`` carry annotations
    whose ``'|||'``-separated fields are read as: field 0 ``start end``
    token offsets, field 1 the edit type, field 2 the ``'||'``-separated
    corrections (``-NONE-`` becomes ``''``) and field 5 the integer
    annotator id.  Lines starting with ``'I '`` are ignored.

    Args:
        gold_file: path (or whatever ``smart_open`` accepts) of the
            annotation file.

    Returns:
        A tuple ``(source_sentences, gold_edits)`` of two parallel lists.
        ``gold_edits[i]`` is a dict mapping annotator id to a list of
        ``(start_offset, end_offset, original, corrections, etype)``
        tuples.  Edits of type ``noop`` are never listed, but their
        annotator still gets an (possibly empty) entry.  When a block has
        no annotations at all the dict is ``{0: []}``.

    Raises:
        AssertionError: if a block has no ``'S '`` line, or contains a line
            (after the first) that is not an ``'S '``, ``'I '`` or ``'A '``
            line.
    """
    fgold = smart_open(gold_file, 'r')
    try:
        puffer = fgold.read()
    finally:
        # Close the handle even when reading fails.
        fgold.close()
    # Text input is used as is; raw bytes are decoded so the string
    # prefix tests below do not fail on byte lines.
    if isinstance(puffer, bytes):
        puffer = puffer.decode('utf8')

    source_sentences = []
    gold_edits = []
    for item in paragraphs(puffer.splitlines(True)):
        lines = item.splitlines(False)
        sentence = [line[2:].strip() for line in lines if line.startswith('S ')]
        assert sentence != [], "annotation block without an 'S ' line"
        # Token view of the whole block, computed once per block instead of
        # once per annotation line.
        tokens = ' '.join(sentence).split()

        annotations = {}
        for line in lines[1:]:
            if line.startswith(('I ', 'S ')):
                continue
            assert line.startswith('A '), "unexpected line in annotation block"
            fields = line[2:].split('|||')
            span = fields[0].split()
            start_offset = int(span[0])
            end_offset = int(span[1])
            etype = fields[1]
            if etype == 'noop':
                # Negative offsets mark the edit so the filter below drops
                # it while the annotator is still registered.
                start_offset = -1
                end_offset = -1
            corrections = [
                c.strip() if c != '-NONE-' else ''
                for c in fields[2].split('||')
            ]
            # NOTE: start and end are *token* offsets
            original = ' '.join(tokens[start_offset:end_offset])
            annotator = int(fields[5])
            annotations.setdefault(annotator, []).append(
                (start_offset, end_offset, original, corrections, etype))

        tok_offset = 0
        for this_sentence in sentence:
            tok_offset += len(this_sentence.split())
            source_sentences.append(this_sentence)
            this_edits = {}
            for annotator, annotation in annotations.items():
                this_edits[annotator] = [
                    edit for edit in annotation
                    if 0 <= edit[0] <= tok_offset and 0 <= edit[1] <= tok_offset
                ]
            if not this_edits:
                # No annotator at all: report a single annotator 0 with no edits.
                this_edits[0] = []
            gold_edits.append(this_edits)
    return (source_sentences, gold_edits)
예제 #3
0
def load_annotation(gold_file):
    """Load source sentences and per-annotator gold edits from a file.

    The file is opened with ``smart_open`` and split into blocks by
    ``paragraphs``.  Inside each block, lines starting with ``"S "`` carry
    source sentences and lines starting with ``"A "`` carry annotations
    whose ``"|||"``-separated fields are read as: field 0 ``start end``
    token offsets, field 1 the edit type, field 2 the ``"||"``-separated
    corrections (``-NONE-`` becomes ``""``) and field 5 the integer
    annotator id.  Lines starting with ``"I "`` are ignored.

    Args:
        gold_file: path (or whatever ``smart_open`` accepts) of the
            annotation file.

    Returns:
        A tuple ``(source_sentences, gold_edits)`` of two parallel lists.
        ``gold_edits[i]`` is a dict mapping annotator id to a list of
        ``(start_offset, end_offset, original, corrections)`` tuples.
        Edits of type ``noop`` are never listed, but their annotator still
        gets an (possibly empty) entry.  When a block has no annotations
        at all the dict is ``{0: []}``.

    Raises:
        AssertionError: if a block has no ``"S "`` line, or contains a line
            (after the first) that is not an ``"S "``, ``"I "`` or ``"A "``
            line.
    """
    fgold = smart_open(gold_file, "r")
    try:
        puffer = fgold.read()
    finally:
        # Close the handle even when reading fails.
        fgold.close()
    # Decode only raw bytes; already-decoded text is used as is, so the
    # loader works whether smart_open yields bytes or text.
    if isinstance(puffer, bytes):
        puffer = puffer.decode("utf8")

    source_sentences = []
    gold_edits = []
    for item in paragraphs(puffer.splitlines(True)):
        lines = item.splitlines(False)
        sentence = [line[2:].strip() for line in lines if line.startswith("S ")]
        assert sentence != [], "annotation block without an 'S ' line"
        # Token view of the whole block, computed once per block instead of
        # once per annotation line.
        tokens = " ".join(sentence).split()

        annotations = {}
        for line in lines[1:]:
            if line.startswith(("I ", "S ")):
                continue
            assert line.startswith("A "), "unexpected line in annotation block"
            fields = line[2:].split("|||")
            span = fields[0].split()
            start_offset = int(span[0])
            end_offset = int(span[1])
            etype = fields[1]
            if etype == "noop":
                # Negative offsets mark the edit so the filter below drops
                # it while the annotator is still registered.
                start_offset = -1
                end_offset = -1
            corrections = [c.strip() if c != "-NONE-" else "" for c in fields[2].split("||")]
            # NOTE: start and end are *token* offsets
            original = " ".join(tokens[start_offset:end_offset])
            annotator = int(fields[5])
            annotations.setdefault(annotator, []).append(
                (start_offset, end_offset, original, corrections))

        tok_offset = 0
        for this_sentence in sentence:
            tok_offset += len(this_sentence.split())
            source_sentences.append(this_sentence)
            this_edits = {}
            # items() instead of iteritems(): available on both Python 2 and 3.
            for annotator, annotation in annotations.items():
                this_edits[annotator] = [
                    edit
                    for edit in annotation
                    if 0 <= edit[0] <= tok_offset and 0 <= edit[1] <= tok_offset
                ]
            if not this_edits:
                # No annotator at all: report a single annotator 0 with no edits.
                this_edits[0] = []
            gold_edits.append(this_edits)
    return (source_sentences, gold_edits)
예제 #4
0
파일: m2scorer.py 프로젝트: xqjin/GEC0Prep
def evaluateIt(system_file, gold_file, verbose=False):
    """Score system hypotheses against a gold annotation file.

    Source sentences and gold edits are read with ``load_annotation``; the
    system file is read one hypothesis per line (stripped).  Scoring is
    delegated to ``levenshtein.batch_multi_pre_rec_f1`` with
    ``max_unchanged_words=2``, ``ignore_whitespace_casing=False`` and
    ``very_verbose=False``.

    Args:
        system_file: path (or whatever ``smart_open`` accepts) of the
            system output, one sentence per line.
        gold_file: path of the gold annotation file.
        verbose: forwarded unchanged to the scoring function.

    Returns:
        The tuple ``(p, r, f1)`` returned by the scoring function.
    """
    max_unchanged_words = 2
    ignore_whitespace_casing = False
    very_verbose = False

    # load source sentences and gold edits
    source_sentences, gold_edits = load_annotation(gold_file)

    # load system hypotheses
    fin = smart_open(system_file, 'r')
    try:
        raw_lines = fin.readlines()
    finally:
        # Close the handle even when reading fails.
        fin.close()
    # Decode only raw bytes so both byte and text file objects are accepted.
    system_sentences = [
        (line.decode("utf8") if isinstance(line, bytes) else line).strip()
        for line in raw_lines
    ]

    p, r, f1 = levenshtein.batch_multi_pre_rec_f1(
        system_sentences, source_sentences, gold_edits,
        max_unchanged_words, ignore_whitespace_casing, verbose, very_verbose)
    return p, r, f1
        max_unchanged_words = int(v)
    elif o == '--ignore_whitespace_casing':
        ignore_whitespace_casing = True
    else:
        print >> sys.stderr, "Unknown option :", o
        print_usage()
        sys.exit(-1)

# starting point
# NOTE(review): `args` and the option variables used below are bound by
# option parsing that is not fully visible in this fragment.
# Exactly two positional arguments are required; otherwise print the usage
# text and exit with status -1.
if len(args) != 2:
    print_usage()
    sys.exit(-1)

# First positional argument: system output; second: gold annotation file.
system_file = args[0]
gold_file = args[1]

# load source sentences and gold edits
source_sentences, gold_edits = load_annotation(gold_file)

# load system hypotheses
# One hypothesis per line, decoded from UTF-8 and stripped of surrounding
# whitespace.
fin = smart_open(system_file, 'r')
system_sentences = [line.decode("utf8").strip() for line in fin.readlines()]
fin.close()

# The scorer's three return values are unpacked as precision, recall and F1.
p, r, f1 = levenshtein.batch_pre_rec_f1(system_sentences, source_sentences, gold_edits, max_unchanged_words, ignore_whitespace_casing, verbose, very_verbose)

# Report each score with four decimal places.
print "Precision   : %.4f" % p
print "Recall      : %.4f" % r
print "F1          : %.4f" % f1

예제 #6
0
    elif o == "--ignore_whitespace_casing":
        ignore_whitespace_casing = True
    else:
        print >> sys.stderr, "Unknown option :", o
        print_usage()
        sys.exit(-1)

# starting point
# NOTE(review): `args` and the option variables used below are bound by
# option parsing that is not fully visible in this fragment.
# Exactly two positional arguments are required; otherwise print the usage
# text and exit with status -1.
if len(args) != 2:
    print_usage()
    sys.exit(-1)

# First positional argument: system output; second: gold annotation file.
system_file = args[0]
gold_file = args[1]

# load source sentences and gold edits
source_sentences, gold_edits = load_annotation(gold_file)

# load system hypotheses
# One hypothesis per line, decoded from UTF-8 and stripped of surrounding
# whitespace.
fin = smart_open(system_file, "r")
system_sentences = [line.decode("utf8").strip() for line in fin.readlines()]
fin.close()

# The scorer's three return values are unpacked as precision, recall and F1.
p, r, f1 = levenshtein.batch_multi_pre_rec_f1(
    system_sentences, source_sentences, gold_edits, max_unchanged_words, ignore_whitespace_casing, verbose, very_verbose
)

# Report each score with four decimal places.
print "Precision   : %.4f" % p
print "Recall      : %.4f" % r
print "F1          : %.4f" % f1
예제 #7
0
    else:
        print("Unknown option :", o, file=sys.stderr)
        print_usage()
        sys.exit(-1)

# starting point
# NOTE(review): `args`, `beta` and the option variables used below are bound
# by option parsing that is not fully visible in this fragment.
# Exactly two positional arguments are required; otherwise print the usage
# text and exit with status -1.
if len(args) != 2:
    print_usage()
    sys.exit(-1)

# First positional argument: system output; second: gold annotation file.
system_file = args[0]
gold_file = args[1]

# load source sentences and gold edits
source_sentences, gold_edits = load_annotation(gold_file)

# load system hypotheses
# One hypothesis per line, stripped of surrounding whitespace.
fin = smart_open(system_file, mode='r')
system_sentences = [line.strip() for line in fin.readlines()]
fin.close()

# The scorer's three return values are unpacked as precision, recall and
# the F score; `beta` is forwarded to the scorer.
p, r, f1 = levenshtein.batch_multi_pre_rec_f1(system_sentences,
                                              source_sentences, gold_edits,
                                              max_unchanged_words, beta,
                                              ignore_whitespace_casing,
                                              verbose, very_verbose)

# Report each score with four decimal places; the F label shows beta with
# one decimal place.
print(("Precision   : %.4f" % p))
print(("Recall      : %.4f" % r))
print(("F_%.1f       : %.4f" % (beta, f1)))