def get_all_fscores(self):
    n_correct = len(self.correct)
    true_entities = get_entities(self.y_true)
    pred_entities = get_entities(self.y_pred)
    n_true = len(true_entities)
    n_pred = len(pred_entities)
    # exact match: span and label must both agree
    p = n_correct / n_pred if n_pred > 0 else 0
    r = n_correct / n_true if n_true > 0 else 0
    exact_f_score = 2 * p * r / (p + r) if p + r > 0 else 0
    # relaxed match: also credit right-label / overlapping-span pairs
    p = (n_correct + len(self.right_label_over_span)) / n_pred if n_pred > 0 else 0
    r = (n_correct + len(self.right_label_over_span)) / n_true if n_true > 0 else 0
    relaxed_f_score = 2 * p * r / (p + r) if p + r > 0 else 0
    # user-experience match: credit partial overlaps by their overlap score
    overlap_pred_score = self.get_overlap_score()
    p = (n_correct + overlap_pred_score) / n_pred if n_pred > 0 else 0
    r = (n_correct + overlap_pred_score) / n_true if n_true > 0 else 0
    user_exp_f_score = 2 * p * r / (p + r) if p + r > 0 else 0
    return exact_f_score, user_exp_f_score, relaxed_f_score
def get_error_types(y_true, y_pred):
    true_entities = get_entities(y_true)
    pred_entities = get_entities(y_pred)
    correct = set(true_entities) & set(pred_entities)
    true_entities_rest = set(true_entities) - correct
    pred_entities_rest = set(pred_entities) - correct
    right_label_overlapping_span = []
    wrong_label_overlapping_span = []
    wrong_label_right_span = []
    for true_entity in list(true_entities_rest):
        for pred_entity in list(pred_entities_rest):
            overlap = get_overlap(true_entity, pred_entity)
            if len(overlap) > 0:
                if true_entity[0] == pred_entity[0]:
                    right_label_overlapping_span.append((true_entity, pred_entity))
                elif (true_entity[1] == pred_entity[1]) and (true_entity[2] == pred_entity[2]):
                    wrong_label_right_span.append((true_entity, pred_entity))
                else:
                    wrong_label_overlapping_span.append((true_entity, pred_entity))
    complete_false_positive = pred_entities_rest - \
        set([item[1] for item in right_label_overlapping_span]) - \
        set([item[1] for item in wrong_label_overlapping_span]) - \
        set([item[1] for item in wrong_label_right_span])
    complete_false_negative = true_entities_rest - \
        set([item[0] for item in right_label_overlapping_span]) - \
        set([item[0] for item in wrong_label_overlapping_span]) - \
        set([item[0] for item in wrong_label_right_span])
    return correct, right_label_overlapping_span, wrong_label_overlapping_span, \
        wrong_label_right_span, complete_false_positive, complete_false_negative
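# A quick, hypothetical walk-through of get_error_types; the tag sequences are
# invented, and get_overlap is assumed to return the shared index range of two
# entity tuples (as it is used above).
y_true = ['B-PER', 'I-PER', 'O', 'B-LOC', 'O', 'B-ORG']
y_pred = ['B-PER', 'O', 'O', 'B-ORG', 'O', 'B-ORG']
correct, rl_os, wl_os, wl_rs, fp, fn = get_error_types(y_true, y_pred)
# correct == {('ORG', 5, 5)}
# rl_os == [(('PER', 0, 1), ('PER', 0, 0))]   right label, overlapping span
# wl_rs == [(('LOC', 3, 3), ('ORG', 3, 3))]   wrong label, right span
# wl_os, fp, fn are empty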
def recall_score_span(y_true, y_pred, average='micro', suffix=False):
    """Compute the recall.

    The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of
    true positives and ``fn`` the number of false negatives. The recall is
    intuitively the ability of the classifier to find all the positive samples.

    The best value is 1 and the worst value is 0.

    Args:
        y_true : 2d array. Ground truth (correct) target values.
        y_pred : 2d array. Estimated targets as returned by a tagger.

    Returns:
        score : float.

    Example:
        >>> from seqeval.metrics import recall_score
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> recall_score(y_true, y_pred)
        0.50
    """
    # keep only the spans, dropping the entity type, so recall is span-based
    true_entities = set((y, z) for x, y, z in get_entities(y_true, suffix))
    pred_entities = set((y, z) for x, y, z in get_entities(y_pred, suffix))
    nb_correct = len(true_entities & pred_entities)
    nb_true = len(true_entities)
    score = nb_correct / nb_true if nb_true > 0 else 0
    return score
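# Note (hypothetical data): recall_score_span drops the entity type before
# comparing, so a span found under the wrong label still counts as recalled.
y_true = [['B-PER', 'I-PER', 'O']]
y_pred = [['B-ORG', 'I-ORG', 'O']]
print(recall_score_span(y_true, y_pred))  # 1.0: the (0, 1) span was recovered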
def test_get_entities_with_non_NE_input(self):
    y_true = ['O', 'O', 'O', 'MISC', 'MISC', 'MISC', 'O', 'PER', 'PER']
    with self.assertWarns(UserWarning):
        get_entities(y_true)
    with self.assertWarns(UserWarning):
        get_entities(y_true, suffix=True)
def evaluate_term_multi_token(test_data, all_preds_bio):
    y_pred = []
    for row in all_preds_bio:
        for tag in row:
            y_pred.append(tag)
    texts_idxes = []
    y_gold = []
    tokens_list = []
    i = 0
    for line in test_data:
        for gold_bio, token in zip(line['tags'], line['tokens']):
            y_gold.append(gold_bio)
            tokens_list.append(token)
            texts_idxes.append(i)
        i = i + 1
    gold_entities = set(get_entities(y_gold))
    pred_entities = set(get_entities(y_pred))
    d1 = defaultdict(set)
    d2 = defaultdict(set)
    for e in gold_entities:
        d1[e[0]].add((e[1], e[2]))
    for e in pred_entities:
        d2[e[0]].add((e[1], e[2]))
    err_analysis_list = []
    for type_name, gold_entities in d1.items():
        pred_entities = d2[type_name]
        for pred_ent in pred_entities:
            eval_type = 'TP' if pred_ent in gold_entities else 'FP'
            tokens = ' '.join(tokens_list[pred_ent[0]:pred_ent[-1] + 1])
            text = test_data[texts_idxes[pred_ent[0]]]['text']
            err_analysis_list.append({
                'Eval_type': eval_type,
                'Term': tokens,
                'Text': text
            })
        for gold_ent in gold_entities:
            if gold_ent not in pred_entities:
                tokens = ' '.join(tokens_list[gold_ent[0]:gold_ent[-1] + 1])
                text = test_data[texts_idxes[gold_ent[0]]]['text']
                err_analysis_list.append({
                    'Eval_type': 'FN',
                    'Term': tokens,
                    'Text': text
                })
    return err_analysis_list
def run(file_name_list, mode):
    import datetime
    from tqdm import tqdm
    print(f"------------------start For {mode}----------------------")
    conlls = []
    all_del_sen = 0
    all_label_num = 0
    for file_name in sorted(file_name_list):
        txt_name = file_name
        if mode != "test":
            ann_name = file_name[:-4] + ".csv"
            preprocess = PreProcess(txt_name, ann_name)
            all_label_num += preprocess.ann.shape[0]
        else:
            preprocess = PreProcess(txt_name)
        conll, info = preprocess.brat2conll()
        all_del_sen += info["del_sen"]
        conlls.extend(conll)
    max_len = max([len(s) + 2 for s in conlls])
    label_num = 0
    for conll in conlls:
        label = [c[-1] for c in conll]
        label_num += len(get_entities(label))
    print(f'''
    total sentences:        {len(conlls)}
    deleted sentences:      {all_del_sen}
    max sentence length:    {max_len}
    original entity count:  {all_label_num}
    current entity count:   {label_num}
    ''')
    # print the label distribution
    label_dict = {}
    for sentence in conlls:
        label = [s[-1] for s in sentence]
        for entity in get_entities(label):
            entity = entity[0]
            label_dict[entity] = label_dict.get(entity, 0) + 1
    if mode != "test":
        print("Entity distribution:")
        for l in LABELS_LIST:
            if l not in label_dict:
                label_dict.update({l: 0})
        total_num = sum(label_dict.values())
        for k, v in sorted(label_dict.items(), key=lambda x: x[0]):
            print(f"{k}:\t{v}\t{v / total_num}")
        print(f"All: {total_num}")
    print("-------------------END----------------------\n")
    return conlls
def entity_visualization(texts: List[List[str]],
                         labels: List[List[str]],
                         output_fname='entity_texts.html'):
    texts_c = deepcopy(texts)
    texts_c = [item[:-1] for item in texts_c]
    entities = [get_entities(item) for item in labels]
    all_entities = list(set([sub_item[0] for item in entities for sub_item in item]))
    all_entities = [item for item in all_entities if item != 'O']
    nb_entities = len(all_entities)
    if nb_entities > len(ENTITY_COLOR):
        # not enough preset colors: pad with random hex colors
        rest_nb_colors = nb_entities - len(ENTITY_COLOR)
        colors = ENTITY_COLOR + [
            '#' + ''.join(random.choice('0123456789ABCDEF') for _ in range(6))
            for _ in range(rest_nb_colors)
        ]
    else:
        colors = ENTITY_COLOR[:nb_entities]
    assert len(colors) == nb_entities
    entity_colors = {all_entities[i]: colors[i] for i in range(nb_entities)}
    with open(output_fname, 'w') as fout:
        for x, y in zip(texts_c, entities):
            fout.write(entity2html(x, y, entity_colors))
def restrict_entities(text, tag, pred_prob, threshold=0.85):
    """Return restricted entities according to the tag sequence:
    keep at most one entity for each entity type.
    """
    group_entities = defaultdict(list)
    chunks = get_entities(tag)
    for chunk_type, chunk_start, chunk_end in chunks:
        chunk_end += 1
        score = float(np.average(pred_prob[chunk_start:chunk_end]))
        if score >= threshold:
            entity = ''.join(text[chunk_start:chunk_end])
            group_entities[chunk_type].append((entity, score, chunk_start, chunk_end))

    results = []
    for entity_type, group in group_entities.items():
        # keep a single entity per type (the last one after sorting by surface text)
        entity = sorted(group, key=lambda x: x[0])[-1]
        results.append({
            'name': entity[0],
            'type': entity_type,
            'score': entity[1],
            'beginOffset': entity[2],
            'endOffset': entity[3]
        })
    return results
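# Hypothetical usage of restrict_entities with char-level text, BIO tags, and
# per-position probabilities. Only chunks whose mean probability clears the
# threshold survive, and at most one entity per type is returned.
text = ['J', 'o', 'h', 'n', ' ', 'D', 'o', 'e']
tag = ['B-PER', 'I-PER', 'I-PER', 'I-PER', 'O', 'B-PER', 'I-PER', 'I-PER']
pred_prob = np.array([0.99, 0.98, 0.97, 0.99, 0.90, 0.60, 0.70, 0.65])
print(restrict_entities(text, tag, pred_prob, threshold=0.85))
# [{'name': 'John', 'type': 'PER', 'score': 0.9825, 'beginOffset': 0, 'endOffset': 4}]
# The 'Doe' chunk averages 0.65 and is filtered out before per-type selection.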
def read_examples_from_file(self, file_path) -> List[InputExample]:
    guid_index = 1
    examples = []
    with open(file_path, encoding="utf-8") as f:
        words, labels = [], []
        metainfo = None
        for line in f:
            line = line.rstrip()
            if line.startswith("#\tpassage"):
                metainfo = line
            elif line == "":
                if words:
                    prods = get_entities(labels)
                    for etype, ss, se in prods:
                        # create a prod-specific instance per entity
                        assert etype == "arm_description"
                        inst_labels = ["O"] * len(words)
                        inst_labels[ss] = "B-arm_description"
                        inst_labels[ss + 1:se + 1] = ["I-arm_description"] * (se - ss)
                        examples.append(
                            InputExample(guid=f"{guid_index}",
                                         words=words,
                                         metainfo=metainfo,
                                         labels=inst_labels))
                        guid_index += 1
                    words, labels = [], []
            else:
                cols = line.strip().split('\t')
                words.append(cols[0])
                labels.append(cols[1])
    return examples
def call(self, predictions, log_verbose=False):
    '''main func entrypoint'''
    preds = predictions["preds"]
    output_index = predictions["output_index"]
    if output_index is None:
        res_file = self.config["solver"]["postproc"].get("res_file", "")
        label_path_file = self.config["data"]["task"]["label_vocab"]
    else:
        res_file = self.config["solver"]["postproc"][output_index].get("res_file", "")
        label_path_file = self.config["data"]["task"]["label_vocab"][output_index]
    if res_file == "":
        logging.info("Infer res not saved. You can check 'res_file' in your config.")
        return
    res_dir = os.path.dirname(res_file)
    if not os.path.exists(res_dir):
        os.makedirs(res_dir)
    logging.info("Save inference result to: {}".format(res_file))

    preds = ids_to_sentences(preds, label_path_file)
    with open(res_file, "w", encoding="utf-8") as out_f:
        for i, pre in enumerate(preds):
            entities = get_entities(pre)  # e.g. [('PER', 0, 1), ('LOC', 3, 3)]
            if not entities:
                out_f.write("Null")
            else:
                new_line = "\t".join(" ".join(map(str, entity)) for entity in entities)
                out_f.write(new_line)
            out_f.write("\n")
def write_outputs_to_json(out_file: str, examples: List[Example],
                          y_preds: List[TAG_SEQUENCE]) -> None:
    """Writes a JSON with prediction outputs.

    Args:
        out_file: path to an output file or '-' to use stdout.
        examples: list of Example instances with associated tokens.
        y_preds: list of predicted tag sequences for each example.
    """
    output = []
    for example, y_pred in zip(examples, y_preds):
        predicted_entities = []
        for entity in get_entities(y_pred):
            entity_class, start_token_ix, end_token_ix = entity
            start_char = example.doc_tokens[start_token_ix].offset
            end_token = example.doc_tokens[end_token_ix]
            end_char = end_token.offset + len(end_token)
            predicted_entities.append({
                'class': entity_class,
                'start_char': start_char,
                'end_char': end_char,
                'text': example.orig_text[start_char:end_char],
            })
        output.append({
            'doc_id': example.doc_id,
            'text': example.orig_text,
            'entities': predicted_entities,
        })
    with smart_open(out_file) as fd:
        json.dump(output, fd)
def calc_char_offset(self, words, tags):
    """
    Examples:
        >>> words = ['EU', 'rejects', 'German', 'call']
        >>> tags = ['B-ORG', 'O', 'B-MISC', 'O']
        >>> entities = get_entities(tags)
        >>> entities
        [('ORG', 0, 0), ('MISC', 2, 2)]
        >>> self.calc_char_offset(words, tags)
        {
            'text': 'EU rejects German call',
            'labels': [[0, 2, 'ORG'], [11, 17, 'MISC']]
        }
    """
    doc = ' '.join(words)
    j = {'text': doc, 'labels': []}
    pos = defaultdict(int)
    for label, start_offset, end_offset in get_entities(tags):
        entity = ' '.join(words[start_offset:end_offset + 1])
        # locate the entity's character span, searching past previous matches
        char_left = doc.index(entity, pos[entity])
        char_right = char_left + len(entity)
        span = [char_left, char_right, label]
        j['labels'].append(span)
        pos[entity] = char_right
    return j
def _build_response(self, split_text, tags, poss, segs=[], words=[]):
    if self.basic_token == 'char':
        res = {
            'words': split_text,
            'pos': poss,
            'char_pos': poss,
            'char_word': words,
            'seg': segs,
            'entities': []
        }
    else:
        res = {'words': split_text, 'pos': poss, 'entities': []}
    chunks = get_entities(tags)
    for chunk_type, chunk_start, chunk_end in chunks:
        chunk = self.post_process_chunk(chunk_type, chunk_start, chunk_end,
                                        split_text, poss)
        if chunk is not None:
            entity = {
                'text': chunk,
                'type': chunk_type,
                'beginOffset': chunk_start,
                'endOffset': chunk_end
            }
            res['entities'].append(entity)
    return res
def detailed_metrics(y_gold, y_pred):
    """Calculate the main classification metrics for every label type.

    Args:
        y_gold: 2d array. Ground truth (correct) target values.
        y_pred: 2d array. Estimated targets as returned by a classifier.

    Returns:
        type_metrics: dict of label types and their metrics.
        macro_avg: dict of weighted macro averages for all metrics across label types.
    """
    gold_entities = set(get_entities(y_gold))
    pred_entities = set(get_entities(y_pred))
    d1 = defaultdict(set)
    d2 = defaultdict(set)
    for e in gold_entities:
        d1[e[0]].add((e[1], e[2]))
    for e in pred_entities:
        d2[e[0]].add((e[1], e[2]))

    metrics = {}
    ps, rs, f1s, s = [], [], [], []
    for type_name, gold_entities in d1.items():
        pred_entities = d2[type_name]
        nb_correct = len(gold_entities & pred_entities)
        nb_pred = len(pred_entities)
        nb_true = len(gold_entities)
        p = nb_correct / nb_pred if nb_pred > 0 else 0
        r = nb_correct / nb_true if nb_true > 0 else 0
        f1 = 2 * p * r / (p + r) if p + r > 0 else 0
        metrics[type_name.lower() + '_precision'] = round(p, 3)
        metrics[type_name.lower() + '_recall'] = round(r, 3)
        metrics[type_name.lower() + '_f1'] = round(f1, 3)
        ps.append(p)
        rs.append(r)
        f1s.append(f1)
        s.append(nb_true)
    macro_avg = {
        'macro_precision': round(np.average(ps, weights=s), 3),
        'macro_recall': round(np.average(rs, weights=s), 3),
        'macro_f1': round(np.average(f1s, weights=s), 3)
    }
    return metrics, macro_avg
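# Hypothetical smoke test for detailed_metrics. PER is fully correct, the LOC
# entity is missed, and the macro averages are weighted by support (1 vs 1).
y_gold = ['B-PER', 'I-PER', 'O', 'B-LOC']
y_pred = ['B-PER', 'I-PER', 'O', 'O']
metrics, macro_avg = detailed_metrics(y_gold, y_pred)
# metrics == {'per_precision': 1.0, 'per_recall': 1.0, 'per_f1': 1.0,
#             'loc_precision': 0, 'loc_recall': 0, 'loc_f1': 0}
# macro_avg == {'macro_precision': 0.5, 'macro_recall': 0.5, 'macro_f1': 0.5}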
def get_metrics(y_true, y_pred, suffix=False):
    true_entities = set(get_entities(y_true, suffix))
    pred_entities = set(get_entities(y_pred, suffix))
    d1 = defaultdict(set)
    d2 = defaultdict(set)
    for e in true_entities:
        d1[e[0]].add((e[1], e[2]))
    for e in pred_entities:
        d2[e[0]].add((e[1], e[2]))

    type_name_list = []
    ps, rs, f1s, s = [], [], [], []
    for type_name, true_entities in d1.items():
        pred_entities = d2[type_name]
        nb_correct = len(true_entities & pred_entities)
        nb_pred = len(pred_entities)
        nb_true = len(true_entities)
        p = nb_correct / nb_pred if nb_pred > 0 else 0
        r = nb_correct / nb_true if nb_true > 0 else 0
        f1 = 2 * p * r / (p + r) if p + r > 0 else 0
        type_name_list.append(type_name)
        ps.append(p)
        rs.append(r)
        f1s.append(f1)
        s.append(nb_true)

    # compute support-weighted averages
    type_name_list.append('avg / total')
    ps.append(np.average(ps, weights=s))
    rs.append(np.average(rs, weights=s))
    f1s.append(np.average(f1s, weights=s))
    s.append(np.sum(s))

    df_metrics = pd.DataFrame({
        'type_name': type_name_list,
        'precision': ps,
        'recall': rs,
        'f1-score': f1s,
        'support': s
    })
    return df_metrics
def get_tag_dict(sequence, tag_texts):
    words = sequence.split()
    entities = get_entities(tag_texts)
    slots = defaultdict(list)
    for slot, start_idx, end_idx in entities:
        slots[slot].append(" ".join(words[start_idx:end_idx + 1]))
    return dict(slots)
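# Hypothetical usage: group slot values by slot name.
sequence = 'book a flight to new york'
tag_texts = ['O', 'O', 'O', 'O', 'B-city', 'I-city']
print(get_tag_dict(sequence, tag_texts))  # {'city': ['new york']}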
def verifyTestDataBalanceCRF(self, y_test):
    """Check tag (class) balance of the test dataset."""
    from seqeval.metrics.sequence_labeling import get_entities
    lst = [ls for sublist in y_test for ls in sublist]
    tags = sorted(set(tg[0] for tg in get_entities(lst)))
    print('{}\t{}'.format(len(tags), tags))
def precision_recall_f1_support_sequence_labelling(y_true, y_pred):
    """Compute precision, recall, f1 and support for sequence labelling tasks.

    For given gold (`y_true`) and predicted (`y_pred`) sequence labels, returns
    the precision, recall, f1 and support per label, and the macro and micro
    average of these scores across labels. Expects `y_true` and `y_pred` to be
    a sequence of IOB1/2, IOE1/2, or IOBES formatted labels.

    Args:
        y_true (list): List of IOB1/2, IOE1/2, or IOBES formatted sequence labels.
        y_pred (list): List of IOB1/2, IOE1/2, or IOBES formatted sequence labels.

    Returns:
        A dictionary of scores keyed by the labels in `y_true` where each
        score is a 4-tuple containing precision, recall, f1 and support.
        Additionally includes the keys 'Macro avg' and 'Micro avg' containing
        the macro and micro averages across scores.
    """
    scores = {}
    # Unique labels, not including NEG
    labels = list({tag.split('-')[-1] for tag in set(y_true) if tag != OUTSIDE})
    labels.sort()  # ensures labels displayed in same order across runs / partitions

    for label in labels:
        y_true_lab = [tag if tag.endswith(label) else OUTSIDE for tag in y_true]
        y_pred_lab = [tag if tag.endswith(label) else OUTSIDE for tag in y_pred]
        # TODO (John): Open a pull request to seqeval with a new function that returns
        # all these scores in one call. There is a lot of repeated computation here.
        precision = precision_score(y_true_lab, y_pred_lab)
        recall = recall_score(y_true_lab, y_pred_lab)
        f1 = f1_score(y_true_lab, y_pred_lab)
        support = len(set(get_entities(y_true_lab)))
        scores[label] = precision, recall, f1, support

    # Get macro and micro performance metrics averages
    macro_precision = mean([v[0] for v in scores.values()])
    macro_recall = mean([v[1] for v in scores.values()])
    macro_f1 = mean([v[2] for v in scores.values()])
    total_support = sum([v[3] for v in scores.values()])
    micro_precision = precision_score(y_true, y_pred)
    micro_recall = recall_score(y_true, y_pred)
    micro_f1 = f1_score(y_true, y_pred)

    scores['Macro avg'] = macro_precision, macro_recall, macro_f1, total_support
    scores['Micro avg'] = micro_precision, micro_recall, micro_f1, total_support

    return scores
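# Hypothetical sanity check; assumes OUTSIDE == 'O' and the seqeval
# precision_score / recall_score / f1_score imports used above.
y_true = ['B-PER', 'I-PER', 'O', 'B-LOC']
y_pred = ['B-PER', 'I-PER', 'O', 'B-LOC']
scores = precision_recall_f1_support_sequence_labelling(y_true, y_pred)
# scores['PER'] == (1.0, 1.0, 1.0, 1), scores['LOC'] == (1.0, 1.0, 1.0, 1),
# and 'Macro avg' / 'Micro avg' both come out to (1.0, 1.0, 1.0, 2).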
def decoding(text, tag_seq):
    assert len(text) == len(tag_seq), \
        f"text len: {len(text)}, tag_seq len: {len(tag_seq)}"
    puncs = list(",.?;!，。？；！")
    splits = [idx for idx in range(len(text)) if text[idx] in puncs]

    # split the text and tags into punctuation-delimited sub-sentences
    prev = 0
    sub_texts, sub_tag_seqs = [], []
    for i, split in enumerate(splits):
        sub_tag_seqs.append(tag_seq[prev:split])
        sub_texts.append(text[prev:split])
        prev = split
    sub_tag_seqs.append(tag_seq[prev:])
    sub_texts.append(text[prev:])

    ents_list = []
    for sub_text, sub_tag_seq in zip(sub_texts, sub_tag_seqs):
        ents = get_entities(sub_tag_seq, suffix=False)
        ents_list.append((sub_text, ents))

    # pair each aspect with the opinion words around it
    aps = []
    no_a_words = []
    for sub_text, ent_list in ents_list:
        sub_aps = []
        sub_no_a_words = []
        for ent in ent_list:
            ent_name, start, end = ent
            if ent_name == "Aspect":
                aspect = sub_text[start:end + 1]
                sub_aps.append([aspect])
                if len(sub_no_a_words) > 0:
                    sub_aps[-1].extend(sub_no_a_words)
                    sub_no_a_words.clear()
            elif ent_name == "Opinion":
                opinion = sub_text[start:end + 1]
                if len(sub_aps) > 0:
                    sub_aps[-1].append(opinion)
                else:
                    sub_no_a_words.append(opinion)

        if sub_aps:
            aps.extend(sub_aps)
            if len(no_a_words) > 0:
                aps[-1].extend(no_a_words)
                no_a_words.clear()
        elif sub_no_a_words:
            if len(aps) > 0:
                aps[-1].extend(sub_no_a_words)
            else:
                no_a_words.extend(sub_no_a_words)

    if no_a_words:
        no_a_words.insert(0, "None")
        aps.append(no_a_words)
    return aps
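# A hypothetical char-level run of decoding: aspects pair with the opinions
# that follow them inside each punctuation-delimited sub-sentence.
text = "味道好，服务差。"
tag_seq = ['B-Aspect', 'I-Aspect', 'B-Opinion', 'O',
           'B-Aspect', 'I-Aspect', 'B-Opinion', 'O']
print(decoding(text, tag_seq))  # [['味道', '好'], ['服务', '差']]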
def split_entity(label_sequence):
    """Extract entities from a label sequence.

    >>> label_sequence = [['O', 'B', 'O', 'B', 'I', 'B'], ['O', 'O', 'B']]
    >>> chunks = [('_', 1, 1), ('_', 3, 4), ('_', 5, 5), ('_', 9, 9)]

    :param label_sequence:
    :return: list of (chunk_type, chunk_start, chunk_end).
    """
    return get_entities(label_sequence)
def process_tokenized_sentence_document(doc: TokenizedSentenceDocument):
    sents = doc.sent_tokens
    metadata = doc.metadata

    logger.warning('Received document labeled %s with %d sentences' % (metadata, len(sents)))
    instances = []
    start_time = time()

    for sent_ind, token_list in enumerate(sents):
        inst_str = create_instance_string(token_list)
        logger.debug('Instance string is %s' % (inst_str))
        instances.append(inst_str)

    dataset = TemporalDocumentDataset.from_instance_list(instances, app.state.tokenizer)
    logger.warning('Dataset is as follows: %s' % (str(dataset.features)))

    preproc_end = time()

    output = app.state.trainer.predict(test_dataset=dataset)
    timex_predictions = np.argmax(output.predictions[0], axis=2)

    timex_results = []
    event_results = []
    relation_results = []

    pred_end = time()

    for sent_ind in range(len(dataset)):
        tokens = app.state.tokenizer.convert_ids_to_tokens(
            dataset.features[sent_ind].input_ids)
        wpind_to_ind = {}
        timex_labels = []
        for token_ind in range(1, len(tokens)):
            if dataset[sent_ind].input_ids[token_ind] <= 2:
                break
            if tokens[token_ind].startswith('Ġ'):
                wpind_to_ind[token_ind] = len(wpind_to_ind)
                timex_labels.append(timex_label_list[timex_predictions[sent_ind][token_ind]])

        timex_entities = get_entities(timex_labels)
        logging.info("Extracted %d timex entities from the sentence" % (len(timex_entities)))
        timex_results.append(
            [Timex(timeClass=label[0], begin=label[1], end=label[2])
             for label in timex_entities])
        event_results.append([])
        relation_results.append([])

    results = TemporalResults(timexes=timex_results, events=event_results,
                              relations=relation_results)

    postproc_end = time()

    preproc_time = preproc_end - start_time
    pred_time = pred_end - preproc_end
    postproc_time = postproc_end - pred_end

    logging.info("Pre-processing time: %f, processing time: %f, post-processing time %f" %
                 (preproc_time, pred_time, postproc_time))

    return results
def extract_gold_entities_multi_token(tokens, gold_bio_tags):
    gold_tags = set(get_entities(gold_bio_tags))
    gold_entities = []
    for gold_tag in gold_tags:
        entity_tokens = ' '.join(tokens[gold_tag[1]:gold_tag[2] + 1])
        gold_entities.append([gold_tag[0], entity_tokens, gold_tag[1], gold_tag[2]])
    return gold_entities
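# Hypothetical demo: recover typed entity strings with token offsets.
tokens = ['Barack', 'Obama', 'visited', 'Paris']
gold_bio_tags = ['B-PER', 'I-PER', 'O', 'B-LOC']
print(extract_gold_entities_multi_token(tokens, gold_bio_tags))
# [['PER', 'Barack Obama', 0, 1], ['LOC', 'Paris', 3, 3]] (order may vary,
# since the entity tuples are iterated from a set)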
def _build_response(self, sent, tags, probs):
    words = self.tokenize(sent)
    res = {
        'words': words,
        'entities': [],
        'terms': [],
        'head_rels': []
    }
    tag_ner, tag_term, tag_rel = tags
    prob_ner, prob_term, prob_rel = probs
    chunks_ner = sequence_labeling.get_entities(tag_ner)
    chunks_term = sequence_labeling.get_entities(tag_term)
    for chunk_type, chunk_start, chunk_end in chunks_ner:
        chunk_end += 1
        entity = {
            'text': ' '.join(words[chunk_start:chunk_end]),
            'type': chunk_type,
            'score': float(np.average(prob_ner[chunk_start:chunk_end])),
            'beginOffset': chunk_start,
            'endOffset': chunk_end
        }
        res['entities'].append(entity)
    for chunk_type, chunk_start, chunk_end in chunks_term:
        chunk_end += 1
        term = {
            'text': ' '.join(words[chunk_start:chunk_end]),
            'type': chunk_type,
            # score term chunks with the term probabilities, not the NER ones
            'score': float(np.average(prob_term[chunk_start:chunk_end])),
            'beginOffset': chunk_start,
            'endOffset': chunk_end
        }
        res['terms'].append(term)
    for i, tag in enumerate(tag_rel):
        if tag:
            rel = {
                'text': words[i],
                'score': f"{round(prob_rel[i], 4)}",
                'offset': i
            }
            res['head_rels'].append(rel)
    return res
def transform(self, X, y=None):
    """Transform documents to document ids.

    Uses the vocabulary learned by fit.

    Args:
        X : iterable. An iterable which yields either str, unicode or file objects.
        y : iterable, label strings.

    Returns:
        features: document id matrix.
        y: label id matrix.
    """
    mentions = []
    mentions_char = []
    left_contexts = []
    right_contexts = []
    outputs = []
    word_ids = [self._word_vocab.doc2id(doc) for doc in X]
    char_ids = [[self._char_vocab.doc2id(w) for w in doc] for doc in X]
    ngram_indices = []
    for sent_word_ids, sent_char_ids in zip(word_ids, char_ids):
        ngrams = self.generate_ngrams(sent_word_ids, n=4)
        ngram_indices.append(ngrams)
        for l, r in ngrams:
            # slice within the current sentence, not the whole corpus
            mentions.append(sent_word_ids[l:r])
            mentions_char.append(sent_char_ids[l:r])
            left_contexts.append(sent_word_ids[:l])
            right_contexts.append(sent_word_ids[r:])

    if y is not None:
        for ngram, labels in zip(ngram_indices, y):
            d = {(begin_offset, end_offset + 1): t
                 for t, begin_offset, end_offset in get_entities(labels)}
            for l, r in ngram:
                if (l, r) in d:
                    outputs.append(self._label_vocab[d[(l, r)]])
                else:
                    outputs.append(self._label_vocab)
        outputs = np.array(outputs)

    inputs = [
        np.array(left_contexts),
        np.array(mentions),
        np.array(mentions_char),
        np.array(right_contexts)
    ]

    if y is not None:
        return inputs, outputs
    return inputs
def summary_data(tags):
    total_entities = []
    for sen_tag in tags:
        entities = get_entities(sen_tag)
        entities = [ele[0] for ele in entities]
        total_entities += entities
    unique, counts = np.unique(total_entities, return_counts=True)
    print('Entities for training:\n', dict(zip(unique, counts)))
    return
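# Hypothetical usage: tally entity types over a batch of tag sequences.
tags = [['B-PER', 'I-PER', 'O'], ['B-LOC', 'O', 'B-PER']]
summary_data(tags)
# Entities for training:
#  {'LOC': 1, 'PER': 2}   (keys and counts are numpy scalar types)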
def calc_char_offset(cls, words, tags):
    doc = ' '.join(words)
    j = {'text': doc, 'labels': []}
    pos = defaultdict(int)
    for label, start_offset, end_offset in get_entities(tags):
        entity = ' '.join(words[start_offset:end_offset + 1])
        char_left = doc.index(entity, pos[entity])
        char_right = char_left + len(entity)
        span = [char_left, char_right, label]
        j['labels'].append(span)
        pos[entity] = char_right
    return j
def _build_response1(self, sent, tags, prob):
    words = self.tokenizer(sent)
    res = ""
    chunks = get_entities(tags)  # currently unused; kept from the original
    for index, obj in enumerate(words):
        res = res + obj + "\t" + tags[index] + "\n"
        if "." in obj:
            res = res + "\n"
        if "।" in obj:  # Devanagari danda (sentence terminator)
            res = res + "\n"
    return res
def extract_tp_actual_correct(y_true, y_pred, suffix, *args):
    entities_true = defaultdict(set)
    entities_pred = defaultdict(set)
    for type_name, start, end in get_entities(y_true, suffix):
        entities_true[type_name].add((start, end))
    for type_name, start, end in get_entities(y_pred, suffix):
        entities_pred[type_name].add((start, end))

    target_names = sorted(set(entities_true.keys()) | set(entities_pred.keys()))

    tp_sum = np.array([], dtype=np.int32)
    pred_sum = np.array([], dtype=np.int32)
    true_sum = np.array([], dtype=np.int32)
    for type_name in target_names:
        entities_true_type = entities_true.get(type_name, set())
        entities_pred_type = entities_pred.get(type_name, set())
        tp_sum = np.append(tp_sum, len(entities_true_type & entities_pred_type))
        pred_sum = np.append(pred_sum, len(entities_pred_type))
        true_sum = np.append(true_sum, len(entities_true_type))

    return pred_sum, tp_sum, true_sum
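# Hypothetical check: the returned arrays are aligned with the sorted type
# names, here ['LOC', 'PER'].
y_true = ['B-PER', 'I-PER', 'O', 'B-LOC']
y_pred = ['B-PER', 'I-PER', 'O', 'B-PER']
pred_sum, tp_sum, true_sum = extract_tp_actual_correct(y_true, y_pred, False)
# pred_sum == [0, 2], tp_sum == [0, 1], true_sum == [1, 1]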
def merge_col_from_tag(row, col, tag):
    """Merge items in the list at row[col] according to the spans in row[tag].

    col is a column holding a token list such as ['this', 'is', 'three', 'dollar'];
    tag is a column holding a tag list such as ['O', 'O', 'B-TBNorm', 'I-TBNorm'].

    Returns: ['this', 'is', 'three dollar'].
    If row[col] is itself a tag column, it returns ['O', 'O', 'B'],
    the tag list matching the merged tokens above.
    """
    l = row[col].copy()
    if col in ['tag', 'tag_pred']:
        # iterate spans right-to-left so earlier offsets stay valid
        for tup in get_entities(row[tag])[::-1]:
            for i in range(tup[1], tup[2] + 1):
                l.pop(tup[1])
            l.insert(tup[1], 'B')
    else:
        for tup in get_entities(row[tag])[::-1]:
            text = row[col][tup[1]:tup[2] + 1]
            for i in range(tup[1], tup[2] + 1):
                l.pop(tup[1])
            l.insert(tup[1], ' '.join(text))
    return l
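# Hypothetical row (a dict or pandas Series with list-valued fields).
row = {
    'token': ['this', 'is', 'three', 'dollar'],
    'tag': ['O', 'O', 'B-TBNorm', 'I-TBNorm'],
}
print(merge_col_from_tag(row, 'token', 'tag'))  # ['this', 'is', 'three dollar']
print(merge_col_from_tag(row, 'tag', 'tag'))    # ['O', 'O', 'B']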
def test_calc_char_offset(self):
    words = ['EU', 'rejects', 'German', 'call']
    tags = ['B-ORG', 'O', 'B-MISC', 'O']
    entities = get_entities(tags)
    actual = CoNLLParser.calc_char_offset(words, tags)
    self.assertEqual(entities, [('ORG', 0, 0), ('MISC', 2, 2)])
    self.assertEqual(
        actual, {
            'text': 'EU rejects German call',
            'labels': [[0, 2, 'ORG'], [11, 17, 'MISC']]
        })