def _statistics_func(self, samples, sys_info: SysOutputInfo):
    """
    Collect vocabulary and length statistics over a set of samples

    :param samples: An iterable of dictionaries, each with a "text" entry
    :param sys_info: Information about the system output; its source
        tokenizer is used to split every text into tokens
    :return: A dictionary with three entries:
        "vocab": token -> number of occurrences (stored as a float)
        "vocab_rank": token -> rank of its frequency, where 1 is the most
            frequent and tokens with equal frequency share a rank
        "length_fre": length in tokens -> fraction of samples of that length
    """
    tokenize = unwrap(sys_info.source_tokenizer)

    vocab: dict[str, float] = {}
    length_counts: dict[int, float] = {}
    num_samples = 0

    for sample in progress(samples):
        tokens = tokenize(sample["text"])

        # how many samples have this many tokens
        n_tokens = len(tokens)
        if n_tokens in length_counts:
            length_counts[n_tokens] += 1.0
        else:
            length_counts[n_tokens] = 1.0

        # how many times each token was seen
        for token in tokens:
            if token in vocab:
                vocab[token] += 1.0
            else:
                vocab[token] = 1.0

        num_samples += 1

    # Rank the distinct frequencies from highest (rank 1) to lowest, then map
    # every token to the rank of its own frequency
    distinct_freqs = sorted(set(vocab.values()), reverse=True)
    rank_of_freq = dict(zip(distinct_freqs, range(1, len(distinct_freqs) + 1)))
    vocab_rank = {token: rank_of_freq[freq] for token, freq in vocab.items()}

    # Turn raw length counts into relative frequencies
    length_fre = {
        length: count * 1.0 / num_samples
        for length, count in length_counts.items()
    }

    return {
        "vocab": vocab,
        "vocab_rank": vocab_rank,
        "length_fre": length_fre
    }
def _complete_features(self,
                       sys_info: SysOutputInfo,
                       sys_output: list[dict],
                       external_stats=None) -> list[str]:
    """
    This function takes in meta-data about system outputs, system outputs, and a
    few other optional pieces of information, then calculates feature functions
    and modifies `sys_output` to add these feature values

    :param sys_info: Information about the system output
    :param sys_output: The system output itself; every entry must have an
        'output' text. Each entry is modified in place and gains the keys
        "text_length", "text_chars", "tok_info" and, when `external_stats` is
        given, "num_oov" and "fre_rank"
    :param external_stats: Training set statistics that are used to calculate
        training set specific features
    :return: The features that are active (e.g. skipping training set features
        when no training set available)
    """
    sys_features = unwrap(sys_info.features)
    active_features = list(
        sys_features.get_bucket_features(
            include_training_dependent=external_stats is not None))

    # Resolve the tokenizer once rather than once per sample
    tokenizer = unwrap(sys_info.source_tokenizer)

    # One pass over the test set to find token test frequency
    all_tokens = [tokenizer(x['output']) for x in sys_output]
    all_log_probs = [self._get_predicted_label(x) for x in sys_output]
    test_freq: dict[str, int] = {}
    for tokens in all_tokens:
        for tok in tokens:
            test_freq[tok] = test_freq.get(tok, 0) + 1

    for dict_sysout, tokens, log_probs in progress(
            zip(sys_output, all_tokens, all_log_probs), desc="featurizing"):
        # Get values of bucketing features
        text = dict_sysout["output"]

        # length in tokens and in characters
        dict_sysout["text_length"] = len(tokens)
        dict_sysout["text_chars"] = len(text)

        # sentence-level training set dependent features
        if external_stats is not None:
            dict_sysout["num_oov"] = self._get_num_oov(tokens, external_stats)
            dict_sysout["fre_rank"] = self._get_fre_rank(
                tokens, external_stats)

        # token-level features
        dict_sysout["tok_info"] = self._complete_tok_features(
            tokens, log_probs, test_freq, statistics=external_stats)

    return active_features
def _complete_features(self,
                       sys_info: SysOutputInfo,
                       sys_output: list[dict],
                       external_stats=None) -> list[str]:
    """
    This function takes in meta-data about system outputs, system outputs, and a
    few other optional pieces of information, then calculates feature functions
    and modifies `sys_output` to add these feature values

    :param sys_info: Information about the system output
    :param sys_output: The system output itself; every entry must have
        "tokens", "true_tags" and "pred_tags". Each entry is modified in place
        and gains the keys "sentence_length", "span_density", "span_info" and,
        when `external_stats` is given, "num_oov" and "fre_rank"
    :param external_stats: Training set statistics that are used to calculate
        training set specific features
    :return: The features that are active (e.g. skipping training set features
        when no training set available)
    """
    sys_features = unwrap(sys_info.features)
    active_features = list(
        sys_features.get_bucket_features(
            include_training_dependent=external_stats is not None))

    for dict_sysout in progress(sys_output, desc="featurizing"):
        # Get values of bucketing features
        tokens = dict_sysout["tokens"]
        num_tokens = len(tokens)

        # sentence_length
        dict_sysout["sentence_length"] = num_tokens

        # entity density: gold spans per token. A sentence without tokens
        # gets density 0.0 instead of raising ZeroDivisionError.
        num_spans = len(
            self._span_ops.get_spans_simple(tags=dict_sysout["true_tags"]))
        dict_sysout["span_density"] = (num_spans /
                                       num_tokens if num_tokens else 0.0)

        # sentence-level training set dependent features
        if external_stats is not None:
            dict_sysout["num_oov"] = self._get_num_oov(tokens, external_stats)
            dict_sysout["fre_rank"] = self._get_fre_rank(
                tokens, external_stats)

        # span features for true and predicted spans
        dict_sysout["span_info"] = self._complete_span_features(
            tokens,
            dict_sysout["true_tags"],
            dict_sysout["pred_tags"],
            statistics=external_stats,
        )

    return active_features
def get_econ_efre_dic(
        self, words: list[str],
        bio_tags: list[str]) -> tuple[dict[str, float], dict[str, int]]:
    """
    Compute the entity label consistency and entity frequency features
    described in https://aclanthology.org/2020.emnlp-main.489.pdf

    :param words: a list of all words in the corpus
    :param bio_tags: a list of all tags in the corpus
    :return: Two dictionaries:
        econ: 'span|||tag' -> fraction of the occurrences of the (lowercased)
            span text in the corpus that carry this tag
        efre: 'span' -> number of gold chunks with this (lowercased) text
    """
    gold_chunks = self._span_ops.get_spans_simple(bio_tags)

    # Pass 1: index the gold chunks. `known_prefixes` works as a pseudo-trie:
    # it holds every word-level prefix of every gold chunk, so pass 2 can stop
    # extending a candidate span as soon as it cannot become a known entity.
    known_prefixes: set[str] = set()
    tag_by_position: dict[tuple[int, int], str] = {}
    tag_counts: dict[str, dict[str, int]] = {}
    efre_dic: dict[str, int] = {}
    for chunk in progress(gold_chunks):
        tag, begin, end = chunk[0], chunk[1], chunk[2]
        tag_by_position[(begin, end)] = tag
        surface = ''
        for offset in range(end - begin):
            word = words[begin + offset].lower()
            surface = word if offset == 0 else f'{surface} {word}'
            known_prefixes.add(surface)
        tag_counts[surface] = {}
        efre_dic[surface] = efre_dic.get(surface, 0) + 1

    # Pass 2: scan every position of the corpus and count, for each occurrence
    # of a known entity string, the tag it carries at that exact position
    # (the default tag when no gold chunk covers exactly that span).
    num_words = len(words)
    for begin in range(num_words):
        surface = ''
        for end in range(begin + 1, num_words + 1):
            word = words[end - 1].lower()
            surface = word if end == begin + 1 else f'{surface} {word}'
            if surface not in known_prefixes:
                break
            counts = tag_counts.get(surface)
            if counts is not None:
                observed = tag_by_position.get((begin, end),
                                               self._DEFAULT_TAG)
                counts[observed] = counts.get(observed, 0) + 1

    # Normalise the per-span tag counts into consistency ratios
    econ_dic: dict[str, float] = {}
    for surface, counts in tag_counts.items():
        total = float(sum(counts.values()))
        econ_dic.update({
            f'{surface}|||{tag}': count / total
            for tag, count in counts.items()
        })
    return econ_dic, efre_dic
def _statistics_func(self, samples: Iterator, sys_info: SysOutputInfo):
    """
    Count how often every head, link and tail occurs in `samples`

    `Samples` is a dataset iterator: List[Dict], to know more about it, you can:
    # pip install datalabs
    dataset = load_dataset("fb15k_237", 'readable')
    print(dataset['train'])

    Identifiers found in the pre-computed entity2wikidata mapping are replaced
    by their 'label' before counting; all others are counted as-is.

    :param samples: An iterable of dictionaries with 'head', 'link' and 'tail'
    :param sys_info: Information about the system output (not used here)
    :return: A dictionary with the entries "head_fre", "link_fre" and
        "tail_fre", each mapping a name to its number of occurrences
    """
    file_path = cache_api.cache_online_file(
        'http://phontron.com/download/explainaboard/pre_computed/kg/entity2wikidata.json',  # noqa
        'pre_computed/kg/entity2wikidata.json',
    )
    # JSON is UTF-8 by specification; do not rely on the platform's default
    # encoding, which breaks on non-ASCII entity labels on some systems.
    with open(file_path, 'r', encoding='utf-8') as file:
        entity_dic = json.load(file)

    frequencies: dict[str, dict[str, int]] = {'tail': {}, 'head': {}, 'link': {}}
    for sample in progress(samples):
        for field, counts in frequencies.items():
            name = sample[field]
            if name in entity_dic:
                name = entity_dic[name]['label']
            counts[name] = counts.get(name, 0) + 1

    return {
        "head_fre": frequencies['head'],
        "link_fre": frequencies['link'],
        "tail_fre": frequencies['tail'],
    }
def bucketing_samples(
    self,
    sys_info: SysOutputInfo,
    sys_output: list[dict],
    active_features: list[str],
    metric_stats: list[MetricStats],
) -> dict[str, list[BucketPerformance]]:
    """
    Separate samples and tokens into buckets and calculate performance over them

    :param sys_info: Information about the system output
    :param sys_output: The system output itself, already annotated with features
    :param active_features: The features to perform bucketing over. Features
        found directly in `sys_info.features` are bucketed per sample by the
        parent class; all remaining ones are treated as token-level features
    :param metric_stats: The stats from which to calculate performance
        (forwarded to the parent class for the sample-level features)
    :return: a dictionary of feature name -> list of performances by bucket
    """
    features = unwrap(sys_info.features)

    # Split the requested features into sample-level and token-level ones
    sample_level = [name for name in active_features if name in features]
    token_level = [name for name in active_features if name not in features]

    # Sample-level features follow the standard protocol of the parent class
    performances_over_bucket = super().bucketing_samples(
        sys_info, sys_output, sample_level, metric_stats)

    # Token-level features: one list of feature values per feature, aligned
    # with `token_level` by position
    per_feature_values = self._get_feature_lists(sys_output, token_level)
    for position, feature_name in enumerate(
            progress(token_level, desc="token-level bucketing")):
        bucket_info = (
            features["tok_info"].feature.feature[feature_name].bucket_info)

        # Look up the bucketing strategy by name
        bucket_func: Callable[..., list[BucketCaseCollection]] = getattr(
            bucketing, bucket_info.method)
        bucketed = bucket_func(
            sample_features=per_feature_values[position],
            bucket_number=bucket_info.number,
            bucket_setting=bucket_info.setting,
        )

        # evaluating bucket: get bucket performance
        performances_over_bucket[feature_name] = (
            self.get_bucket_performance_lm(sys_info, sys_output, bucketed))

    return performances_over_bucket
def accumulate_vocab_from_samples(samples: Iterator,
                                  text_from_sample: Callable,
                                  tokenizer: Tokenizer):
    """
    Count token occurrences over a set of samples and rank tokens by frequency

    :param samples: An iterable of samples
    :param text_from_sample: Called with one sample, returns the text to
        tokenize
    :param tokenizer: Called with a text, returns an iterable of tokens
    :return: A dictionary with two entries:
        "vocab": token -> number of occurrences
        "vocab_rank": token -> rank of its frequency, where 1 is the most
            frequent and tokens with equal frequency share a rank
    """
    vocab: dict[str, int] = {}
    for sample in progress(samples):
        text = text_from_sample(sample)
        for token in tokenizer(text):
            if token in vocab:
                vocab[token] += 1
            else:
                vocab[token] = 1

    # Rank the distinct counts from highest (rank 1) to lowest, then map every
    # token to the rank of its own count
    distinct_counts = sorted(set(vocab.values()), reverse=True)
    rank_of_count = dict(
        zip(distinct_counts, range(1, len(distinct_counts) + 1)))
    vocab_rank = {token: rank_of_count[count] for token, count in vocab.items()}

    return {
        "vocab": vocab,
        "vocab_rank": vocab_rank,
    }
def bucketing_samples(
    self,
    sys_info: SysOutputInfo,
    sys_output: list[dict],
    active_features: list[str],
    metric_stats: list[MetricStats],
) -> dict[str, list[BucketPerformance]]:
    """
    Group the samples into buckets by each active feature and evaluate every
    bucket

    :param sys_info: Information about the system output
    :param sys_output: The system output itself, already annotated with features
    :param active_features: The features to perform bucketing over
    :param metric_stats: The stats from which to calculate performance
    :return:
        performances_over_bucket:
            a dictionary of feature name -> list of performances by bucket
    """
    sys_features = unwrap(sys_info.features)
    performances_over_bucket: dict[str, list[BucketPerformance]] = {}

    for feature_name in progress(active_features,
                                 desc="sample-level bucketing"):
        bucket_info = sys_features[feature_name].bucket_info

        # The bucketing strategy is looked up by name in the bucketing module
        bucket_func: Callable[..., list[BucketCaseCollection]] = getattr(
            explainaboard.utils.bucketing, bucket_info.method)

        # Pair each sample (identified by its index) with its feature value
        sample_features = [(BucketCase(sample_id), sample[feature_name])
                           for sample_id, sample in enumerate(sys_output)]
        samples_over_bucket = bucket_func(
            sample_features=sample_features,
            bucket_number=bucket_info.number,
            bucket_setting=bucket_info.setting,
        )

        # evaluating bucket: get bucket performance
        performances_over_bucket[feature_name] = self.get_bucket_performance(
            sys_info,
            sys_output,
            samples_over_bucket,
            metric_stats=metric_stats,
        )

    return performances_over_bucket
def _statistics_func(self, samples: Dataset, sys_info: SysOutputInfo):
    """
    Collect vocabulary and entity statistics over a labelled dataset

    :param samples: The dataset to gather statistics from; `samples.info.features`
        is passed to `DatalabFileLoader.replace_labels` together with each
        sample, and the result must have "tokens" and "tags" entries
    :param sys_info: Information about the system output (not used here)
    :return: A dictionary with four entries:
        "efre_dic" and "econ_dic": the entity frequency and entity label
            consistency dictionaries returned by `get_econ_efre_dic` for the
            concatenation of all samples
        "vocab": token -> number of occurrences
        "vocab_rank": token -> rank of its frequency, where 1 is the most
            frequent and tokens with equal frequency share a rank
    """
    dl_features = samples.info.features

    tokens_sequences = []
    tags_sequences = []
    vocab: dict[str, int] = {}
    for sample in progress(samples):
        rep_sample = DatalabFileLoader.replace_labels(dl_features, sample)
        tokens, tags = rep_sample["tokens"], rep_sample["tags"]

        # update vocabulary
        for token in tokens:
            vocab[token] = vocab.get(token, 0) + 1

        # the corpus-level statistics below work on one flat sequence
        tokens_sequences += tokens
        tags_sequences += tags

    # econ and efre dictionaries
    econ_dic, efre_dic = self.get_econ_efre_dic(tokens_sequences,
                                                tags_sequences)

    # vocab_rank: the rank of each word based on its frequency
    sorted_dict = {
        key: rank
        for rank, key in enumerate(
            sorted(set(vocab.values()), reverse=True), 1)
    }
    vocab_rank = {k: sorted_dict[v] for k, v in vocab.items()}

    return {
        "efre_dic": efre_dic,
        "econ_dic": econ_dic,
        "vocab": vocab,
        "vocab_rank": vocab_rank,
    }
def bucketing_samples( self, sys_info: SysOutputInfo, sys_output: list[dict], active_features: list[str], metric_stats: list[MetricStats], ) -> dict[str, list[BucketPerformance]]: features = unwrap(sys_info.features) sent_feats: list[str] = [] tok_feats: list[str] = [] for x in active_features: (sent_feats if (x in features) else tok_feats).append(x) # First, get the buckets for sentences using the standard protocol performances_over_bucket = super().bucketing_samples( sys_info, sys_output, sent_feats, metric_stats) all_sample_features = self._get_sample_features(sys_output, tok_feats) # Second, get the buckets for tokens for feature_id, feature_name in enumerate( progress(tok_feats, desc="bucketing token features")): # Choose behavior based on whether this is a feature of samples or spans my_feature = features["ref_tok_info"].feature.feature[feature_name] bucket_info = my_feature.bucket_info # Get buckets for true spans bucket_func: Callable[..., list[BucketCaseCollection]] = getattr( bucketing, bucket_info.method) sample_features = [(case, feats[feature_id]) for case, feats in all_sample_features] samples_over_bucket = bucket_func(
def _complete_features(
    self, sys_info: SysOutputInfo, sys_output: list[dict], external_stats=None
) -> list[str]:
    """
    Compute the value of every active bucketing feature for every sample and
    store it in the sample under the feature's name (`sys_output` is modified
    in place)

    :param sys_info: Information about the system output
    :param sys_output: The system output itself
    :param external_stats: External statistics that are used to calculate
        training set specific features
    :return: The features that are active (e.g. skipping training set features
        when no training set available)
    """
    sys_features = unwrap(sys_info.features)
    stats_missing = external_stats is None

    # feature name -> (feature function, whether it needs training statistics)
    active: dict[str, tuple[Callable, bool]] = {}
    for name in sys_features.get_bucket_features():
        info = sys_features[name]
        # Skip training set features if no stats
        if stats_missing and info.require_training_set:
            continue
        func = self._get_feature_func(name, info.is_custom)
        active[name] = (func, info.require_training_set)

    for dict_sysout in progress(sys_output, desc="featurizing"):
        for name, (func, needs_stats) in active.items():
            if sys_features[name].is_custom:
                # User-defined features already come with a value; a list
                # value is flattened into one "_"-joined string.
                # TODO(Pengfei): this should be generalized
                raw_value = dict_sysout[name]
                if isinstance(raw_value, list):
                    dict_sysout[name] = "_".join(raw_value)
                else:
                    dict_sysout[name] = raw_value
                continue

            # All other features are computed by their feature function
            if needs_stats:
                dict_sysout[name] = func(sys_info, dict_sysout, external_stats)
            else:
                dict_sysout[name] = func(sys_info, dict_sysout)

    return list(active)
def draw_bar_chart_from_reports(reports: list[str],
                                output_dir: str,
                                sys_names: list[str] | None = None) -> None:
    """
    Draw bar charts from report file generated from ExplainaBoard

    One chart named 'overall' is drawn for the overall results, plus one chart
    per (feature, metric) pair of the fine-grained results. Features are taken
    from the first report; a feature is skipped (with a logged error) when it
    is missing from a report or its number of buckets differs between reports.

    :param reports: Paths of the JSON reports to plot, one per system
    :param output_dir: Directory the charts are written to; created if missing
    :param sys_names: Display name for each report. Defaults to the base name
        of each report path with '.json' removed
    :return: None
    :raises ValueError: if `sys_names` is given but differs in length from
        `reports`
    """
    # TODO(gneubig): This should get the system name from inside the report
    if sys_names is None:
        sys_names = [os.path.basename(x).replace('.json', '') for x in reports]
    elif len(sys_names) != len(reports):
        raise ValueError('Length of sys_names must equal that of reports')

    report_info: list[SysOutputInfo] = []
    for report in reports:
        # JSON is UTF-8 by specification; do not rely on the platform default
        with open(report, encoding='utf-8') as fin:
            report_info.append(SysOutputInfo.from_dict(json.load(fin)))
    overall_results = [
        list(unwrap(x.results.overall).values()) for x in report_info
    ]
    overall_metric_names = list(unwrap(report_info[0].results.overall).keys())
    fg_results = [unwrap(x.results.fine_grained) for x in report_info]

    # exist_ok avoids the race between checking for and creating the directory
    os.makedirs(output_dir, exist_ok=True)

    # Overall performance
    make_bar_chart(
        [[x.value for x in y] for y in overall_results],
        output_dir,
        'overall',
        output_fig_format='png',
        fig_size=(8, 6),
        sys_names=sys_names,
        errs=_confidence_error_bars(overall_results),
        title=None,
        xticklabels=overall_metric_names,
        ylabel='metric value',
    )

    # Bucket performance: feature name, for example, sentence length
    for feature_name in progress(fg_results[0].keys()):
        # Make sure that buckets exist in every report and are comparable.
        # The loop stops at the first report that breaks this, leaving
        # `buckets` shorter than `reports`.
        buckets: list[list[BucketPerformance]] = []
        for i, fg_result in enumerate(fg_results):
            if feature_name not in fg_result:
                get_logger().error(
                    f'error: feature {feature_name} not in {reports[i]}')
            else:
                buckets.append(fg_result[feature_name])
                bnames0 = [x.bucket_interval for x in buckets[0]]
                bnames = [x.bucket_interval for x in buckets[-1]]
                if len(bnames0) != len(bnames):
                    get_logger().error(
                        f'error: different number of buckets for {feature_name} in '
                        f'{reports[0]} and {reports[i]}')
                    buckets = []
                elif bnames0 != bnames:
                    # Same number of buckets but different labels: still
                    # plotted, using the labels of the first report
                    get_logger().warning(
                        f'warning: different bucket labels for {feature_name} in '
                        f'{reports[0]} and {reports[i]}')
            if len(buckets) != i + 1:
                break
        if len(buckets) != len(reports):
            continue
        if not buckets[0]:
            # Nothing to plot, and no bucket to read the metric names from
            get_logger().warning(
                f'warning: no buckets for {feature_name}, skipping')
            continue

        bucket0_intervals = [x.bucket_interval for x in buckets[0]]
        bucket_metrics = [x.metric_name for x in buckets[0][0].performances]
        for metric_id, metric_name in enumerate(bucket_metrics):
            performances: list[list[Performance]] = [
                [x.performances[metric_id] for x in y] for y in buckets
            ]
            make_bar_chart(
                [[x.value for x in y] for y in performances],
                output_dir,
                f'{feature_name}_{metric_name}',
                output_fig_format='png',
                fig_size=(8, 6),
                sys_names=sys_names,
                errs=_confidence_error_bars(performances),
                title=None,
                xlabel=feature_name,
                xticklabels=bucket0_intervals,
                ylabel=metric_name,
            )


def _confidence_error_bars(rows):
    """
    Build error bars from the confidence intervals of performance values

    :param rows: One list of performances per system; each performance has
        `value`, `confidence_score_low` and `confidence_score_high`
    :return: One (distances below the value, distances above the value) tuple
        per system, or None when there is no performance at all or the first
        performance of the first system has no lower confidence bound
    """
    if not rows or not rows[0] or rows[0][0].confidence_score_low is None:
        return None
    return [(
        [x.value - unwrap(x.confidence_score_low) for x in y],
        [unwrap(x.confidence_score_high) - x.value for x in y],
    ) for y in rows]
def bucketing_samples(
    self,
    sys_info: SysOutputInfo,
    sys_output: list[dict],
    active_features: list[str],
    metric_stats: list[MetricStats],
) -> dict[str, list[BucketPerformance]]:
    """
    Separate samples and spans into buckets and calculate performance over them

    :param sys_info: Information about the system output
    :param sys_output: The system output itself, already annotated with
        features; every entry must have a 'span_info' list
    :param active_features: The features to perform bucketing over. Features
        found directly in `sys_info.features` are bucketed per sample by the
        parent class; all remaining ones are treated as span-level features
    :param metric_stats: The stats from which to calculate performance
        (forwarded to the parent class for the sample-level features)
    :return: a dictionary of feature name -> list of performances by bucket
    """
    features = unwrap(sys_info.features)

    # Split the requested features into sample-level and span-level ones
    sample_level = [name for name in active_features if name in features]
    span_level = [name for name in active_features if name not in features]

    # Sample-level features follow the standard protocol of the parent class
    performances_over_bucket = super().bucketing_samples(
        sys_info, sys_output, sample_level, metric_stats)

    # Flatten all spans of all samples into (bucket case, span) pairs
    case_spans: list[tuple[BucketCaseLabeledSpan, Span]] = []
    for sample_id, my_output in enumerate(sys_output):
        for span_info in my_output['span_info']:
            span = cast(Span, span_info)
            # span_tag holds "<gold tag> <predicted tag>"
            gold_label, predicted_label = unwrap(span.span_tag).split(' ')
            bucket_case = BucketCaseLabeledSpan(
                sample_id=sample_id,
                token_span=unwrap(span.span_pos),
                char_span=unwrap(span.span_char_pos),
                orig_str='tokens',
                text=unwrap(span.span_text),
                true_label=gold_label,
                predicted_label=predicted_label,
            )
            case_spans.append((bucket_case, span))

    for feature_name in progress(span_level, desc="span-level bucketing"):
        bucket_info = (features["true_span_info"].feature.
                       feature[feature_name].bucket_info)

        # Look up the bucketing strategy by name
        bucket_func: Callable[..., list[BucketCaseCollection]] = getattr(
            bucketing, bucket_info.method)

        if feature_name != 'span_tag':
            sample_features = [(case, getattr(span, feature_name))
                               for case, span in case_spans]
        else:
            # span_tag keeps track of both labels; bucket by the gold one only
            sample_features = [(case, unwrap(span.span_tag).split(' ')[0])
                               for case, span in case_spans]

        samples_over_bucket = bucket_func(
            sample_features=sample_features,
            bucket_number=bucket_info.number,
            bucket_setting=bucket_info.setting,
        )

        # evaluating bucket: get bucket performance
        performances_over_bucket[feature_name] = (
            self.get_bucket_performance_seqlab(
                sys_info,
                sys_output,
                samples_over_bucket,
            ))

    return performances_over_bucket