def _calc_src_buckets_and_matches(self, src_sent, src_label, ref_sent, ref_aligns, out_sents):
    """
    Bucket the words of one source sentence and count, for every output,
    how many source words per bucket were matched.

    A source word counts as matched for an output when it is aligned to at
    least one reference word and every reference word it is aligned to has a
    non-negative entry in that output's reference-match list.

    Args:
      src_sent: The source sentence (sequence of tokens)
      src_label: Labels for the source tokens, or a falsy value for none
      ref_sent: The reference sentence (sequence of tokens)
      ref_aligns: Iterable of (source index, reference index) pairs
      out_sents: One output sentence per system

    Returns:
      A tuple of: per-bucket source word counts, the same counts broadcast
      to one row per output (a view, not a copy), per-output per-bucket
      match counts, the bucket of every source word, the reference indices
      aligned to every source word, and the reference-match lists returned
      by self._calc_trg_matches
    """
    # Normalise case up front when requested
    if self.case_insensitive:
        to_lower = corpus_utils.lower
        src_sent = [to_lower(tok) for tok in src_sent]
        ref_sent = [to_lower(tok) for tok in ref_sent]
        out_sents = [[to_lower(tok) for tok in sent] for sent in out_sents]
    if not src_label:
        src_label = []

    # Only the reference-side matches are needed here
    _, ref_matches = self._calc_trg_matches(ref_sent, out_sents)

    # Bucket of every source word; missing labels are padded with None
    src_buckets = [
        self.calc_bucket(tok, label=lab)
        for tok, lab in itertools.zip_longest(src_sent, src_label)
    ]

    # Reference positions that must be correct for each source word
    src_aligns = [[] for _ in src_sent]
    for src_idx, ref_idx in ref_aligns:
        src_aligns[src_idx].append(ref_idx)

    # Per-sentence totals
    num_buckets = len(self.bucket_strs)
    num_outs = len(out_sents)
    my_ref_total = np.zeros(num_buckets, dtype=int)
    my_out_matches = np.zeros((num_outs, num_buckets), dtype=int)
    for bucket in src_buckets:
        my_ref_total[bucket] += 1
    # Every output shares the same per-bucket totals on the source side
    my_out_totals = np.broadcast_to(
        np.reshape(my_ref_total, (1, num_buckets)), (num_outs, num_buckets))

    for out_idx, (_, ref_match) in enumerate(zip(out_sents, ref_matches)):
        for bucket, aligned in zip(src_buckets, src_aligns):
            if len(aligned) == 0:
                continue
            hits = [ref_match[pos] >= 0 for pos in aligned]
            if all(hits):
                my_out_matches[out_idx, bucket] += 1

    return my_ref_total, my_out_totals, my_out_matches, src_buckets, src_aligns, ref_matches
def _edit_distance(self, ref, out):
    """
    Compute the weighted edit distance between a reference and an output

    Uses self.sub_pen, self.del_pen and self.ins_pen as the costs of a
    substitution, of dropping a reference token and of adding an output
    token respectively.

    Args:
      ref: A reference sentence (sequence of tokens)
      out: An output sentence (sequence of tokens)

    Returns:
      The minimum total edit cost, as a numpy float
    """
    if self.case_insensitive:
        ref = corpus_utils.lower(ref)
        out = corpus_utils.lower(out)
    ref_len, out_len = len(ref), len(out)
    scores = np.zeros((ref_len + 1, out_len + 1))
    # Borders must use the same penalties as the recurrence below: reaching
    # (i, 0) means deleting i reference tokens, reaching (0, j) means
    # inserting j output tokens. Unit costs are only right when both
    # penalties are 1.
    scores[:, 0] = np.arange(ref_len + 1) * self.del_pen
    scores[0, :] = np.arange(out_len + 1) * self.ins_pen
    # Forward edit distance
    for i in range(ref_len):
        for j in range(out_len):
            # Tokens are compared directly so that empty inputs and
            # non-list sequences do not depend on numpy's array coercion
            my_action = 0 if ref[i] == out[j] else 1
            sub_score = scores[i, j] + my_action * self.sub_pen
            del_score = scores[i, j + 1] + self.del_pen
            ins_score = scores[i + 1, j] + self.ins_pen
            scores[i + 1, j + 1] = min(sub_score, del_score, ins_score)
    return scores[-1, -1]
def _calc_trg_buckets_and_matches(self, ref_sent, ref_label, out_sents, out_labels):
    """
    Bucket the words of one reference sentence and of each output, and
    count totals and matches per bucket.

    An output word whose match entry is non-negative takes the bucket of
    the reference word at that index; otherwise it is bucketed on its own.

    Args:
      ref_sent: The reference sentence (sequence of tokens)
      ref_label: Labels for the reference tokens, or a falsy value for none
      out_sents: One output sentence per system
      out_labels: Labels for each output; discarded when ref_label is falsy

    Returns:
      A tuple of: per-bucket reference counts, per-output per-bucket word
      counts, per-output per-bucket match counts, the bucket of every
      reference word, the bucket of every output word, and the output-match
      lists returned by self._calc_trg_matches
    """
    # Normalise case up front when requested
    if self.case_insensitive:
        to_lower = corpus_utils.lower
        ref_sent = [to_lower(tok) for tok in ref_sent]
        out_sents = [[to_lower(tok) for tok in sent] for sent in out_sents]
    if not ref_label:
        ref_label = []
        out_labels = [[] for _ in out_sents]

    # Only the output-side matches are needed here
    out_matches, _ = self._calc_trg_matches(ref_sent, out_sents)

    # Bucket of every reference word; missing labels are padded with None
    ref_buckets = [
        self.calc_bucket(tok, label=lab)
        for tok, lab in itertools.zip_longest(ref_sent, ref_label)
    ]

    # Bucket of every output word
    out_buckets = [[] for _ in out_sents]
    per_output = itertools.zip_longest(out_sents, out_labels, out_matches, out_buckets)
    for sent, labels, sent_matches, sent_buckets in per_output:
        for tok, lab, ref_idx in itertools.zip_longest(sent, labels, sent_matches):
            if ref_idx < 0:
                sent_buckets.append(self.calc_bucket(tok, label=lab))
            else:
                sent_buckets.append(ref_buckets[ref_idx])

    # Per-sentence totals
    num_buckets = len(self.bucket_strs)
    num_outs = len(out_sents)
    my_ref_total = np.zeros(num_buckets, dtype=int)
    my_out_totals = np.zeros((num_outs, num_buckets), dtype=int)
    my_out_matches = np.zeros((num_outs, num_buckets), dtype=int)
    for bucket in ref_buckets:
        my_ref_total[bucket] += 1
    for out_idx, (sent_buckets, sent_matches) in enumerate(zip(out_buckets, out_matches)):
        for bucket, ref_idx in zip(sent_buckets, sent_matches):
            my_out_totals[out_idx, bucket] += 1
            if ref_idx >= 0:
                my_out_matches[out_idx, bucket] += 1

    return my_ref_total, my_out_totals, my_out_matches, ref_buckets, out_buckets, out_matches
def cache_stats(self, ref, out, src=None):
    """
    Cache sufficient statistics for calculating BLEU score

    Args:
      ref: A reference corpus
      out: An output corpus
      src: A source corpus. Ignored if passed

    Returns:
      A list with one (reference length, output length, precisions) tuple
      per sentence pair, where precisions holds self._precision for every
      n-gram order from 1 to len(self.weights)
    """
    if self.case_insensitive:
        ref = corpus_utils.lower(ref)
        out = corpus_utils.lower(out)

    def _precisions(ref_sent, out_sent):
        # One precision entry per n-gram order covered by self.weights
        return [self._precision(ref_sent, out_sent, n)
                for n in range(1, len(self.weights) + 1)]

    return [(len(r), len(o), _precisions(r, o)) for r, o in zip(ref, out)]
def score_sentence(self, ref, out):
    """
    Score a single sentence pair with the configured ROUGE variant

    Args:
      ref: A reference sentence (sequence of tokens)
      out: An output sentence (sequence of tokens)

    Returns:
      The statistic selected by self.score_type (f-measure, precision or
      recall), and None

    Raises:
      ValueError: If self.rouge_type or self.score_type is not recognised
    """
    if self.case_insensitive:
        ref = corpus_utils.lower(ref)
        out = corpus_utils.lower(out)

    if self._stemmer:
        # Tokens of three characters or fewer are left unstemmed
        ref = [self._stemmer.stem(tok) if len(tok) > 3 else tok for tok in ref]
        out = [self._stemmer.stem(tok) if len(tok) > 3 else tok for tok in out]

    if self.rouge_type == 'rougeL':
        scores = rouge_scorer._score_lcs(ref, out)
    elif re.match(r"rouge[0-9]$", self.rouge_type) is None:
        raise ValueError(f"Invalid rouge type: {self.rouge_type}")
    else:
        # Everything after the "rouge" prefix is the n-gram order
        n = int(self.rouge_type[5:])
        if n <= 0:
            raise ValueError(
                f"rougen requires positive n: {self.rouge_type}")
        ref_ngrams = rouge_scorer._create_ngrams(ref, n)
        out_ngrams = rouge_scorer._create_ngrams(out, n)
        scores = rouge_scorer._score_ngrams(ref_ngrams, out_ngrams)

    if self.score_type not in ('fmeasure', 'precision', 'recall'):
        raise ValueError(f"Invalid score type: {self.score_type}")
    return getattr(scores, self.score_type), None
def cache_stats(self, ref, out):
    """
    Cache sufficient statistics for calculating BLEU score

    Args:
      ref: A reference corpus
      out: An output corpus

    Returns:
      A tuple of three parallel lists: reference lengths, output lengths,
      and per-sentence lists of self._precision values for every n-gram
      order from 1 to len(self.weights)
    """
    if self.case_insensitive:
        ref = corpus_utils.lower(ref)
        out = corpus_utils.lower(out)

    ref_lens, out_lens, precisions = [], [], []
    for ref_sent, out_sent in zip(ref, out):
        ref_lens.append(len(ref_sent))
        out_lens.append(len(out_sent))
        precisions.append([
            self._precision(ref_sent, out_sent, n)
            for n in range(1, len(self.weights) + 1)
        ])
    return (ref_lens, out_lens, precisions)
def __init__(self, freq_counts=None, freq_count_file=None, freq_corpus_file=None, freq_data=None, bucket_cutoffs=None, case_insensitive=False):
    """
    A bucketer that buckets words by their frequency.

    Args:
      freq_counts: A dictionary containing word/count data.
      freq_count_file: A file containing counts for each word in tab-separated word, count format.
                       Ignored if freq_counts exists.
      freq_corpus_file: A file with a corpus used for collecting counts. Ignored if freq_count_file exists.
      freq_data: A tokenized corpus from which counts can be calculated. Ignored if freq_corpus_file exists.
      bucket_cutoffs: Cutoffs for each bucket.
                      The first bucket will be range(0,bucket_cutoffs[0]).
                      Middle buckets will be range(bucket_cutoffs[i-1],bucket_cutoffs[i]).
                      Final bucket will be everything greater than bucket_cutoffs[-1].
      case_insensitive: A boolean specifying whether to turn on the case insensitive option.

    Raises:
      ValueError: If no source of frequency counts is given.
    """
    self.case_insensitive = case_insensitive
    if not freq_counts:
        freq_counts = defaultdict(lambda: 0)
        if freq_count_file is not None:
            print(f'Reading frequency from "{freq_count_file}"')
            with open(freq_count_file, "r") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        # Tolerate blank lines instead of failing on unpack
                        continue
                    word, freq = line.split('\t')
                    # The file stores counts as text; convert so they can be
                    # accumulated and compared against the numeric cutoffs
                    freq = int(freq)
                    if self.case_insensitive:
                        # Case variants of a word share one entry
                        freq_counts[corpus_utils.lower(word)] += freq
                    else:
                        freq_counts[word] = freq
        elif freq_corpus_file:
            print(f'Reading frequency from "{freq_corpus_file}"')
            for words in corpus_utils.iterate_tokens(freq_corpus_file):
                for word in words:
                    if self.case_insensitive:
                        freq_counts[corpus_utils.lower(word)] += 1
                    else:
                        freq_counts[word] += 1
        elif freq_data:
            print('Reading frequency from the reference')
            for words in freq_data:
                for word in words:
                    if self.case_insensitive:
                        freq_counts[corpus_utils.lower(word)] += 1
                    else:
                        freq_counts[word] += 1
        else:
            raise ValueError(
                'Must have at least one source of frequency counts for FreqWordBucketer'
            )
    self.freq_counts = freq_counts

    if bucket_cutoffs is None:
        bucket_cutoffs = [1, 2, 3, 4, 5, 10, 100, 1000]
    self.set_bucket_cutoffs(bucket_cutoffs)
def calc_bucket(self, val, ref=None, src=None, label=None):
    """
    Bucket a sentence by its sentence-level score against the reference

    Args:
      val: The sentence to bucket; passed to the scorer as the output
      ref: The reference sentence
      src: The source sentence, forwarded to the scorer unchanged
      label: Unused here

    Returns:
      The bucket that self.cutoff_into_bucket assigns to the first element
      of self.scorer.score_sentence(ref, val, src)
    """
    if self.case_insensitive:
        ref = corpus_utils.lower(ref)
        val = corpus_utils.lower(val)
    # src is forwarded in both modes; previously the case-insensitive path
    # dropped it, so the scorer saw different arguments depending on the flag
    return self.cutoff_into_bucket(
        self.scorer.score_sentence(ref, val, src)[0])
def calc_source_bucketed_matches(self, src, ref, out, ref_aligns, out_aligns, src_labels=None):
    """
    Calculate the number of matches, bucketed by the type of source word we have
    This must be used with a subclass that has self.bucket_strs defined, and self.calc_bucket(word) implemented.

    Args:
      src: The source corpus
      ref: The reference corpus
      out: The output corpus
      ref_aligns: Alignments of the reference corpus, as (source index, target index) pairs per sentence
      out_aligns: Alignments of the output corpus, as (source index, target index) pairs per sentence
      src_labels: Labels of the source corpus (optional)

    Yields:
      One tuple per bucket, in the order of self.bucket_strs, containing:
        both_tot: the number of output alignments whose output word was still available in the reference
        ref_tot: the number of reference alignments falling in the bucket
        out_tot: the number of output alignments falling in the bucket
        rec: recall of the bucket (0.0 when ref_tot is zero)
        prec: precision of the bucket (0.0 when out_tot is zero)
        fmeas: f1-measure of the bucket (0.0 when both rec and prec are zero)
    """
    if not hasattr(self, 'case_insensitive'):
        self.case_insensitive = False

    src_labels = src_labels if src_labels else []
    matches = [[0, 0, 0] for _ in self.bucket_strs]
    for src_sent, ref_sent, out_sent, ref_align, out_align, src_lab in itertools.zip_longest(
            src, ref, out, ref_aligns, out_aligns, src_labels):
        # How many copies of each reference word are still unmatched
        ref_cnt = defaultdict(lambda: 0)
        for word in ref_sent:
            if self.case_insensitive:
                word = corpus_utils.lower(word)
            ref_cnt[word] += 1
        for src_index, trg_index in out_align:
            src_word = src_sent[src_index]
            word = out_sent[trg_index]
            if self.case_insensitive:
                word = corpus_utils.lower(word)
            bucket = self.calc_bucket(
                src_word, label=src_lab[src_index] if src_lab else None)
            if ref_cnt[word] > 0:
                # Each reference occurrence can be consumed only once
                ref_cnt[word] -= 1
                matches[bucket][0] += 1
            matches[bucket][2] += 1
        for src_index, trg_index in ref_align:
            src_word = src_sent[src_index]
            bucket = self.calc_bucket(
                src_word, label=src_lab[src_index] if src_lab else None)
            matches[bucket][1] += 1

    for both_tot, ref_tot, out_tot in matches:
        # A source word may be aligned in the output but not in the
        # reference, so both_tot > 0 does not guarantee ref_tot > 0; guard
        # each denominator separately instead of dividing by zero
        rec = both_tot / float(ref_tot) if ref_tot else 0.0
        prec = both_tot / float(out_tot) if out_tot else 0.0
        fmeas = 2 * prec * rec / (prec + rec) if prec + rec > 0 else 0.0
        yield both_tot, ref_tot, out_tot, rec, prec, fmeas
def score_corpus(self, ref, out):
    """
    Score a corpus with sacrebleu's corpus-level BLEU

    Args:
      ref: A reference corpus (one token sequence per sentence)
      out: An output corpus (one token sequence per sentence)

    Returns:
      The `score` attribute of the sacrebleu result, and None
    """
    if self.case_insensitive:
        ref = corpus_utils.lower(ref)
        out = corpus_utils.lower(out)

    # sacrebleu is given whole sentences, so tokens are re-joined with spaces
    hypotheses = [" ".join(tokens) for tokens in out]
    references = [" ".join(tokens) for tokens in ref]
    result = sacrebleu.corpus_bleu(hypotheses, [references])
    return result.score, None
def ngram_context_align(ref, out, order=-1, case_insensitive=False):
    """
    Calculate the word alignment between a reference sentence and an output sentence.
    Proposed in the following paper:

    Automatic Evaluation of Translation Quality for Distant Language Pairs
    Hideki Isozaki, Tsutomu Hirao, Kevin Duh, Katsuhito Sudoh, Hajime Tsukada
    http://www.anthology.aclweb.org/D/D10/D10-1092.pdf

    Args:
      ref: A reference sentence
      out: An output sentence
      order: The highest order of grams we want to consider (-1=inf)
      case_insensitive: A boolean specifying whether to turn on the case insensitive option

    Returns:
      The word alignment, represented as a list of integers.
    """
    if case_insensitive:
        ref = corpus_utils.lower(ref)
        out = corpus_utils.lower(out)

    if order == -1:
        order = len(ref)

    ref_gram_pos = _count_ngram(ref, order)
    out_gram_pos = _count_ngram(out, order)

    def _unique_in_both(length, gram):
        # True when the n-gram occurs exactly once on each side
        return len(ref_gram_pos[length][gram]) == len(out_gram_pos[length][gram]) == 1

    worder = []
    for i, word in enumerate(out):
        # Words absent from the reference cannot be aligned
        if len(ref_gram_pos[1][word]) == 0:
            continue
        if _unique_in_both(1, word):
            worder.append(ref_gram_pos[1][word][0])
            continue

        # Ambiguous word: grow the context one token at a time, trying the
        # left-extended n-gram before the right-extended one at each size
        backward_gram = word
        forward_gram = word
        for j in range(1, order):
            if i - j >= 0:
                backward_gram = out[i - j] + ' ' + backward_gram
                if _unique_in_both(j + 1, backward_gram):
                    # The word is the last token of the n-gram, j positions in
                    worder.append(ref_gram_pos[j + 1][backward_gram][0] + j)
                    break
            if i + j < len(out):
                forward_gram = forward_gram + ' ' + out[i + j]
                if _unique_in_both(j + 1, forward_gram):
                    worder.append(ref_gram_pos[j + 1][forward_gram][0])
                    break

    return worder
def score_corpus(self, ref, out):
    """
    Score a corpus using ChrF score

    Args:
      ref: A reference corpus
      out: An output corpus

    Returns:
      A tuple containing the value returned by self.chrf_score and None
    """
    if self.case_insensitive:
        references = [[corpus_utils.lower(sent)] for sent in ref]
        hypotheses = corpus_utils.lower(out)
    else:
        references = [[sent] for sent in ref]
        hypotheses = out
    # Each reference sentence is wrapped in its own single-element list
    return self.chrf_score(references, hypotheses), None
def score_sentence(self, ref, out):
    """
    Score a single sentence with sentence-level smoothed BLEU score

    Args:
      ref: A reference sentence
      out: An output sentence

    Returns:
      The sentence-level BLEU score, and None
    """
    smoother = nltk.translate.bleu_score.SmoothingFunction()
    if self.case_insensitive:
        references = [corpus_utils.lower(ref)]
        hypothesis = corpus_utils.lower(out)
    else:
        references = [ref]
        hypothesis = out
    score = nltk.translate.bleu_score.sentence_bleu(
        references, hypothesis, smoothing_function=smoother.method2)
    return score, None
def cache_stats(self, ref, out, src=None):
    """
    Cache sufficient statistics for calculating SacreBLEU score

    Args:
      ref: A reference corpus
      out: An output corpus
      src: A source corpus. Ignored if passed

    Returns:
      Whatever self.bleu._extract_corpus_statistics returns for the
      space-joined output sentences and a single stream of space-joined
      reference sentences
    """
    if self.case_insensitive:
        ref = corpus_utils.lower(ref)
        out = corpus_utils.lower(out)

    references = [' '.join(tokens) for tokens in ref]
    hypotheses = [' '.join(tokens) for tokens in out]
    return self.bleu._extract_corpus_statistics(hypotheses, [references])
def calc_bucket(self, word, ref_label=None, out_label=None, src_label=None):
    """
    Bucket a word by its count in self.freq_counts

    Args:
      word: The word to bucket
      ref_label: Unused here
      out_label: Unused here
      src_label: Unused here

    Returns:
      The bucket self.cutoff_into_bucket assigns to the word's count, with
      words missing from self.freq_counts counted as 0
    """
    lookup_key = corpus_utils.lower(word) if self.case_insensitive else word
    return self.cutoff_into_bucket(self.freq_counts.get(lookup_key, 0))
def cache_stats(self, ref, out):
    """
    Cache sufficient statistics for calculating scores

    Args:
      ref: A reference corpus
      out: An output corpus

    Returns:
      A list holding the first element of self.score_sentence for every
      reference/output sentence pair
    """
    # Objects that never set case_insensitive are treated as case sensitive
    if getattr(self, 'case_insensitive', False):
        ref = corpus_utils.lower(ref)
        out = corpus_utils.lower(out)
    return [self.score_sentence(ref_sent, out_sent)[0]
            for ref_sent, out_sent in zip(ref, out)]
def cache_stats(self, ref, out):
    """
    Cache sufficient statistics for calculating SacreBLEU score

    Args:
      ref: A reference corpus
      out: An output corpus

    Returns:
      A list with one (counts, totals, sys_len, ref_len) tuple per sentence
      pair, taken from the sacrebleu result for that pair
    """
    if self.case_insensitive:
        ref = corpus_utils.lower(ref)
        out = corpus_utils.lower(out)

    cached_stats = []
    for r, o in zip(ref, out):
        # Pass a one-sentence corpus explicitly: a list of hypotheses and a
        # list of reference streams, rather than bare strings.
        # NOTE(review): older sacrebleu wrapped bare strings this way itself,
        # newer releases are believed to reject them -- confirm against the
        # pinned version.
        bleu = sacrebleu.corpus_bleu([" ".join(o)], [[" ".join(r)]])
        cached_stats.append(
            (bleu.counts, bleu.totals, bleu.sys_len, bleu.ref_len))
    return cached_stats
def score_sentence(self, ref, out, src=None):
    """
    Score a single sentence pair with the configured ROUGE variant

    Args:
      ref: A reference sentence (sequence of tokens)
      out: An output sentence (sequence of tokens)
      src: A source sentence. Not used here

    Returns:
      The statistic selected by self.score_type (f-measure, precision or
      recall) multiplied by self.scale, and None

    Raises:
      ValueError: If self.rouge_type or self.score_type is not recognised
    """
    if self.case_insensitive:
        ref = corpus_utils.lower(ref)
        out = corpus_utils.lower(out)

    if self._stemmer:
        # Tokens of three characters or fewer are left unstemmed
        ref = [self._stemmer.stem(tok) if len(tok) > 3 else tok for tok in ref]
        out = [self._stemmer.stem(tok) if len(tok) > 3 else tok for tok in out]

    if self.rouge_type == 'rougeL':
        ref = self.tokenize(" ".join(ref))
        out = self.tokenize(" ".join(out))
        scores = rouge_scorer._score_lcs(ref, out)
    elif self.rouge_type == 'rougeLsum':
        # Summary-level LCS works on per-sentence token lists
        refs = [self.tokenize(s) for s in self.get_sents(ref)]
        outs = [self.tokenize(s) for s in self.get_sents(out)]
        scores = rouge_scorer._summary_level_lcs(refs, outs)
    elif re.match(r"rouge[0-9]$", self.rouge_type) is None:
        raise ValueError(f"Invalid rouge type: {self.rouge_type}")
    else:
        ref = self.tokenize(" ".join(ref))
        out = self.tokenize(" ".join(out))
        # Everything after the "rouge" prefix is the n-gram order
        n = int(self.rouge_type[5:])
        if n <= 0:
            raise ValueError(
                f"rougen requires positive n: {self.rouge_type}")
        ref_ngrams = rouge_scorer._create_ngrams(ref, n)
        out_ngrams = rouge_scorer._create_ngrams(out, n)
        scores = rouge_scorer._score_ngrams(ref_ngrams, out_ngrams)

    if self.score_type not in ('fmeasure', 'precision', 'recall'):
        raise ValueError(f"Invalid score type: {self.score_type}")
    score_value = getattr(scores, self.score_type)
    return self.scale * score_value, None
def cache_stats(self, ref, out, src=None):
    """
    Cache sufficient statistics for calculating scores

    Args:
      ref: A reference corpus
      out: An output corpus
      src: A source corpus. Might be ignored or required depending on the metric

    Returns:
      A list holding the first element of self.score_sentence for every
      reference/output/source sentence triple
    """
    # Objects that never set case_insensitive are treated as case sensitive
    if getattr(self, 'case_insensitive', False):
        ref = corpus_utils.lower(ref)
        out = corpus_utils.lower(out)

    if src is None:
        # Without a source corpus every sentence is scored with src=None
        src = [None for _ in ref]
    return [self.score_sentence(ref_sent, out_sent, src_sent)[0]
            for ref_sent, out_sent, src_sent in zip(ref, out, src)]
def score_sentence(self, ref, out):
    """
    Score a single sentence with WER

    Uses self.sub_pen, self.del_pen and self.ins_pen as the costs of a
    substitution, of dropping a reference token and of adding an output
    token respectively.

    Args:
      ref: A reference sentence
      out: An output sentence

    Returns:
      The WER (minimum total edit cost, as a numpy float), and None
    """
    if self.case_insensitive:
        ref = corpus_utils.lower(ref)
        out = corpus_utils.lower(out)

    ref_len, out_len = len(ref), len(out)
    scores = np.zeros((ref_len + 1, out_len + 1))
    # Borders must use the same penalties as the recurrence below: reaching
    # (i, 0) means deleting i reference tokens, reaching (0, j) means
    # inserting j output tokens. Unit costs are only right when both
    # penalties are 1.
    scores[:, 0] = np.arange(ref_len + 1) * self.del_pen
    scores[0, :] = np.arange(out_len + 1) * self.ins_pen

    # Forward edit distance
    for i in range(ref_len):
        for j in range(out_len):
            # Tokens are compared directly so that empty inputs and
            # non-list sequences do not depend on numpy's array coercion
            my_action = 0 if ref[i] == out[j] else 1
            sub_score = scores[i, j] + my_action * self.sub_pen
            del_score = scores[i, j + 1] + self.del_pen
            ins_score = scores[i + 1, j] + self.ins_pen
            scores[i + 1, j + 1] = min(sub_score, del_score, ins_score)

    return scores[-1, -1], None
def calc_bucketed_likelihoods(self, corpus, likelihoods):
    """
    Calculate the average of log likelihoods, bucketed by the type of word/label we have
    This must be used with a subclass that has self.bucket_strs defined, and self.calc_bucket(word) implemented.

    Args:
      corpus: The text/label corpus over which we compute the likelihoods;
              a string is loaded with corpus_utils.load_tokens
      likelihoods: The log-likelihoods corresponding to each word/label in the corpus

    Yields:
      For each bucket in self.bucket_strs, the average log-likelihood of
      the words in it, or the string "NA" when the bucket is empty

    Raises:
      ValueError: If the corpus and the likelihoods differ in size, either
        overall or within a sentence
    """
    if not hasattr(self, 'case_insensitive'):
        self.case_insensitive = False

    if type(corpus) is str:
        corpus = corpus_utils.load_tokens(corpus)

    # One [sum of log-likelihoods, word count] accumulator per bucket
    accumulators = [[0.0, 0] for _ in self.bucket_strs]
    if len(corpus) != len(likelihoods):
        raise ValueError(
            "Corpus and likelihoods should have the same size.")

    for sent, sent_lls in zip(corpus, likelihoods):
        if len(sent) != len(sent_lls):
            raise ValueError(
                "Each sentence of the corpus should have likelihood value for each word"
            )
        for word, ll in zip(sent, sent_lls):
            if self.case_insensitive:
                word = corpus_utils.lower(word)
            # The word is passed as its own label as well
            accumulator = accumulators[self.calc_bucket(word, label=word)]
            accumulator[0] += ll
            accumulator[1] += 1

    for total, count in accumulators:
        if count == 0:
            yield "NA"  # not applicable
        else:
            yield total / float(count)
def generate_ngram_report(ref, outs,
                          min_ngram_length=1, max_ngram_length=4,
                          report_length=50, alpha=1.0, compare_type='match',
                          ref_labels=None, out_labels=None,
                          compare_directions='0-1',
                          case_insensitive=False):
    """
    Generate a report comparing aggregate n-gram statistics in both plain text and graphs

    Args:
      ref: Tokens from the reference
      outs: Tokens from the output file(s)
      min_ngram_length: minimum n-gram length
      max_ngram_length: maximum n-gram length
      report_length: the number of n-grams to report
      alpha: when sorting n-grams for salient features, the smoothing coefficient. A higher smoothing coefficient
             will result in more frequent phenomena (sometimes this is good).
      compare_type: what type of statistic to compare
                    (match: n-grams that match the reference, over: over-produced ngrams, under: under-produced ngrams)
      ref_labels: either a filename of a file full of reference labels, or a list of strings corresponding to `ref`.
                  If specified, will aggregate statistics over labels instead of n-grams.
      out_labels: output labels. must be specified if ref_labels is specified.
      compare_directions: A string specifying which systems to compare
      case_insensitive: A boolean (or the string 'True') specifying whether to turn on the case insensitive option

    Returns:
      The reporters.NgramReport that was built, after generate_report has been called on it

    Raises:
      ValueError: If the number of output label files differs from the
        number of outputs, or if compare_type is not match/over/under
    """
    # Numeric options may be given as strings, so convert them up front
    min_ngram_length = int(min_ngram_length)
    max_ngram_length = int(max_ngram_length)
    report_length = int(report_length)
    alpha = float(alpha)
    # Accept a real boolean as well as the string 'True'; comparing only
    # against the string silently turned a boolean True into False
    case_insensitive = case_insensitive is True or case_insensitive == 'True'

    if out_labels is not None:
        out_labels = arg_utils.parse_files(out_labels)
        if len(out_labels) != len(outs):
            raise ValueError('The number of output files should be equal to the number of output labels.')

    if isinstance(ref_labels, str):
        # Labels came from files: keep the file names for the report header
        label_files_str = f' ref_labels={ref_labels},'
        for i, out_label in enumerate(out_labels):
            label_files_str += f' out{i}_labels={out_label},'
        label_files = label_files_str
    else:
        label_files = None

    # Lowercasing is skipped when reference labels are read from a file
    if not isinstance(ref_labels, str) and case_insensitive:
        ref = corpus_utils.lower(ref)
        outs = [corpus_utils.lower(out) for out in outs]

    if isinstance(ref_labels, str):
        ref_labels = corpus_utils.load_tokens(ref_labels)
    if out_labels is None:
        out_labels = [None for _ in outs]
    else:
        out_labels = [corpus_utils.load_tokens(out_labels[i]) for i in range(len(outs))]

    totals, matches, overs, unders = zip(*[
        ngram_utils.compare_ngrams(ref, out, ref_labels=ref_labels, out_labels=out_label,
                                   min_length=min_ngram_length, max_length=max_ngram_length)
        for out, out_label in zip(outs, out_labels)])
    direcs = arg_utils.parse_compare_directions(compare_directions)

    stats_by_type = {'match': matches, 'over': overs, 'under': unders}
    scores = []
    for (left, right) in direcs:
        if compare_type not in stats_by_type:
            raise ValueError(f'Illegal compare_type "{compare_type}"')
        stats = stats_by_type[compare_type]
        scores.append(stat_utils.extract_salient_features(stats[left], stats[right], alpha=alpha))
    # Highest-scoring features first
    scorelist = [sorted(score.items(), key=operator.itemgetter(1), reverse=True) for score in scores]

    reporter = reporters.NgramReport(scorelist=scorelist, report_length=report_length,
                                     min_ngram_length=min_ngram_length,
                                     max_ngram_length=max_ngram_length,
                                     matches=matches,
                                     compare_type=compare_type, alpha=alpha,
                                     compare_directions=direcs,
                                     label_files=label_files)
    reporter.generate_report(output_fig_file=f'ngram-min{min_ngram_length}-max{max_ngram_length}-{compare_type}',
                             output_fig_format='pdf',
                             output_directory='outputs')
    return reporter
def calc_bucket(self, word, label=None):
    """
    Bucket a word by its count in self.freq_counts

    Args:
      word: The word to bucket
      label: Unused here

    Returns:
      The bucket self.cutoff_into_bucket assigns to the word's count, with
      words missing from self.freq_counts counted as 0
    """
    lookup_key = corpus_utils.lower(word) if self.case_insensitive else word
    count = self.freq_counts.get(lookup_key, 0)
    return self.cutoff_into_bucket(count)
def calc_bucketed_matches(self, ref, out, ref_labels=None, out_labels=None):
    """
    Calculate the number of matches, bucketed by the type of word we have
    This must be used with a subclass that has self.bucket_strs defined, and self.calc_bucket(word) implemented.

    Args:
      ref: The reference corpus
      out: The output corpus
      ref_labels: Labels of the reference corpus (optional)
      out_labels: Labels of the output corpus (should be specified iff ref_labels is)

    Yields:
      One tuple per bucket, in the order of self.bucket_strs, containing:
        both_tot: the number of output words matched to a reference word
        ref_tot: the number of reference words in the bucket
        out_tot: the number of output words in the bucket
        rec: recall of the bucket
        prec: precision of the bucket
        fmeas: f1-measure of the bucket
    """
    if not hasattr(self, 'case_insensitive'):
        self.case_insensitive = False

    ref_labels = ref_labels or []
    out_labels = out_labels or []
    matches = [[0, 0, 0] for _ in self.bucket_strs]

    sentences = itertools.zip_longest(ref, out, ref_labels, out_labels)
    for ref_sent, out_sent, ref_lab, out_lab in sentences:
        # Unmatched reference positions of every word, in sentence order
        ref_pos = defaultdict(list)
        for ref_idx, token in enumerate(ref_sent):
            if self.case_insensitive:
                token = corpus_utils.lower(token)
            ref_pos[token].append(ref_idx)

        for out_idx, token in enumerate(out_sent):
            if self.case_insensitive:
                token = corpus_utils.lower(token)
            candidates = ref_pos[token]
            if len(candidates) > 0:
                # Consume the earliest unmatched reference occurrence
                ref_idx = candidates[0]
                ref_pos[token] = candidates[1:]
                bucket = self.calc_bucket(
                    token,
                    ref_label=ref_lab[ref_idx] if ref_lab else None,
                    out_label=out_lab[out_idx] if out_lab else None)
                matches[bucket][0] += 1
                matches[bucket][1] += 1
            else:
                bucket = self.calc_bucket(
                    token,
                    out_label=out_lab[out_idx] if out_lab else None)
            # Every output word counts towards its bucket's output total
            matches[bucket][2] += 1

        # Reference words never matched still count towards reference totals
        for leftover in ref_pos.values():
            for ref_idx in leftover:
                bucket = self.calc_bucket(
                    ref_sent[ref_idx],
                    ref_label=ref_lab[ref_idx] if ref_lab else None)
                matches[bucket][1] += 1

    for both_tot, ref_tot, out_tot in matches:
        rec = prec = fmeas = 0.0
        if both_tot != 0:
            rec = both_tot / float(ref_tot)
            prec = both_tot / float(out_tot)
            fmeas = 2 * prec * rec / (prec + rec)
        yield both_tot, ref_tot, out_tot, rec, prec, fmeas