def add_conll_utterance(self, parsed, tokens, corefs, speaker_id, use_gold_mentions, debug=False):
    """Register one CoNLL utterance, its speaker and its mentions on this document.

    Args:
        parsed: Parsed utterance. It is sliced by token index and its ``text``
            and ``sents`` attributes are read (presumably a spaCy ``Doc`` --
            confirm against the caller).
        tokens: CoNLL tokens of the utterance; stored in ``self.conll_tokens``.
        corefs: List of dicts with ``label``, ``start`` and ``end`` keys.
            ``start``/``end`` arrive as CoNLL token indexes and are rewritten
            IN PLACE to indexes into ``parsed``; the same list object is then
            stored in ``self.gold_corefs``.
        speaker_id: Key into ``self.speakers``. An unknown id gets a new
            ``Speaker`` whose name is the id split on ``"_"``.
        use_gold_mentions: When true, one ``Mention`` is built per coref.
            Otherwise mentions come from ``extract_mentions_spans`` and gold
            labels are attached to those whose span matches a coref exactly.
        debug: When true, print progress information.

    Raises:
        ValueError: If a coref has no value (missing key or ``None``) for
            ``label``, ``start`` or ``end``. Nothing on ``self`` and nothing in
            ``corefs`` has been modified when this is raised.
    """
    required_keys = ("label", "start", "end")
    conll_lookup = self.get_conll_spacy_lookup(tokens, parsed)

    # Validate and convert every coref BEFORE mutating anything, so that a bad
    # coref cannot leave conll_tokens/conll_lookup one entry ahead of
    # utterances, nor leave `corefs` half converted.
    converted_spans = []
    for coref in corefs:
        missing_values = [key for key in required_keys if coref.get(key, None) is None]
        if missing_values:
            found_values = {
                key: coref[key] for key in required_keys if coref.get(key, None) is not None
            }
            raise ValueError(
                f"Coref {self.name} with fields {found_values} has empty values for the keys {missing_values}."
            )
        # A CoNLL token maps to a sequence of parsed-token indexes: the span
        # starts at the first index of its first token and ends at the last
        # index of its last token.
        converted_spans.append(
            (conll_lookup[coref["start"]][0], conll_lookup[coref["end"]][-1])
        )

    # Commit phase: everything above succeeded.
    self.conll_tokens.append(tokens)
    self.conll_lookup.append(conll_lookup)
    for coref, (start, end) in zip(corefs, converted_spans):
        coref["start"] = start
        coref["end"] = end

    if speaker_id not in self.speakers:
        speaker_name = speaker_id.split("_")
        if debug:
            print("New speaker: ", speaker_id, "name: ", speaker_name)
        self.speakers[speaker_id] = Speaker(speaker_id, speaker_name)
    speaker = self.speakers[speaker_id]

    # The utterance is only appended at the very end, so this is its index.
    utterance_index = len(self.utterances)

    if use_gold_mentions:
        for coref in corefs:
            mention = Mention(
                parsed[coref["start"]:coref["end"] + 1],
                len(self.mentions),
                utterance_index,
                self.n_sents,
                speaker=speaker,
                gold_label=coref["label"],
            )
            self.mentions.append(mention)
    else:
        mentions_spans = extract_mentions_spans(doc=parsed, blacklist=self.blacklist)
        self._process_mentions(mentions_spans, utterance_index, self.n_sents, speaker)

        # Assign a gold label to mentions which have one
        if debug:
            print("Check corefs", corefs)
        # Filter once instead of rescanning every mention of the document for
        # each coref.
        current_mentions = [
            m for m in self.mentions if m.utterance_index == utterance_index
        ]
        for coref in corefs:
            found = False
            for m in current_mentions:
                # coref["end"] is inclusive; m.end is compared as exclusive.
                if coref["start"] == m.start and coref["end"] == m.end - 1:
                    m.gold_label = coref["label"]
                    found = True
            if found:
                continue
            # Only this branch can miss a gold span: the gold branch builds
            # its mentions directly from the corefs.
            gold_span = parsed[coref["start"]:coref["end"] + 1]
            self.missed_gold.append([
                self.name,
                self.part,
                str(utterance_index),
                parsed.text,
                gold_span.text,
            ])
            if debug:
                print("❄️ gold mention not in predicted mentions", coref, gold_span)

    self.utterances.append(parsed)
    self.gold_corefs.append(corefs)
    self.utterances_speaker.append(speaker)
    self.n_sents += sum(1 for _ in parsed.sents)
def add_conll_utterance(self, parsed, tokens, corefs, speaker_id, use_gold_mentions, debug=False):
    """Add a CoNLL utterance to the document, together with its mentions.

    Args:
        parsed: Parsed utterance. It is sliced by token index and its ``text``
            and ``sents`` attributes are read (presumably a spaCy ``Doc`` --
            confirm against the caller).
        tokens: CoNLL tokens of the utterance; appended to ``self.conll_tokens``.
        corefs: List of dicts carrying ``label``, ``start`` and ``end``.
            ``start``/``end`` are CoNLL token indexes on entry and are
            overwritten IN PLACE with indexes into ``parsed``; the list is
            then appended to ``self.gold_corefs``.
        speaker_id: Key into ``self.speakers``; a ``Speaker`` is created the
            first time an id is seen, named by splitting the id on ``"_"``.
        use_gold_mentions: If true, create one ``Mention`` per coref. If false,
            mentions are extracted with ``extract_mentions_spans`` and gold
            labels are copied onto the ones whose span equals a coref span.
        debug: If true, print progress information.

    Raises:
        ValueError: If any coref lacks a value (key absent or ``None``) for
            ``label``, ``start`` or ``end``. The check is a real ``raise``
            rather than an ``assert`` so it still runs under ``python -O``,
            and it happens before ``self`` or ``corefs`` are modified.
    """
    mandatory = ("label", "start", "end")
    conll_lookup = self.get_conll_spacy_lookup(tokens, parsed)

    # First pass: check and translate all corefs without side effects, so an
    # invalid entry cannot leave the document's parallel lists out of step.
    parsed_bounds = []
    for coref in corefs:
        absent = [key for key in mandatory if coref.get(key) is None]
        if absent:
            present = {key: coref[key] for key in mandatory if coref.get(key) is not None}
            raise ValueError(
                f"Coref {self.name} with fields {present} has empty values for the keys {absent}."
            )
        # Each CoNLL token maps to a sequence of parsed-token indexes; take
        # the first index of the start token and the last of the end token.
        first_index = conll_lookup[coref["start"]][0]
        last_index = conll_lookup[coref["end"]][-1]
        parsed_bounds.append((first_index, last_index))

    # Second pass: commit.
    self.conll_tokens.append(tokens)
    self.conll_lookup.append(conll_lookup)
    for coref, (first_index, last_index) in zip(corefs, parsed_bounds):
        coref["start"] = first_index
        coref["end"] = last_index

    if speaker_id not in self.speakers:
        speaker_name = speaker_id.split("_")
        if debug:
            print("New speaker: ", speaker_id, "name: ", speaker_name)
        self.speakers[speaker_id] = Speaker(speaker_id, speaker_name)
    speaker = self.speakers[speaker_id]

    # `parsed` is appended to self.utterances last, so this is its index.
    utt_index = len(self.utterances)

    if use_gold_mentions:
        for coref in corefs:
            self.mentions.append(
                Mention(
                    parsed[coref["start"]:coref["end"] + 1],
                    len(self.mentions),
                    utt_index,
                    self.n_sents,
                    speaker=speaker,
                    gold_label=coref["label"],
                )
            )
    else:
        mentions_spans = extract_mentions_spans(doc=parsed, blacklist=self.blacklist)
        self._process_mentions(mentions_spans, utt_index, self.n_sents, speaker)

        # Assign a gold label to mentions which have one
        if debug:
            print("Check corefs", corefs)
        # Restrict the search to this utterance's mentions once, up front.
        candidates = [m for m in self.mentions if m.utterance_index == utt_index]
        for coref in corefs:
            matched = False
            for m in candidates:
                # coref["end"] is inclusive, hence the comparison with m.end - 1.
                if coref["start"] == m.start and coref["end"] == m.end - 1:
                    m.gold_label = coref["label"]
                    matched = True
            if not matched:
                # Gold spans can only be missed here: with gold mentions every
                # coref becomes a mention by construction.
                missed_span = parsed[coref["start"]:coref["end"] + 1]
                self.missed_gold.append([
                    self.name,
                    self.part,
                    str(utt_index),
                    parsed.text,
                    missed_span.text,
                ])
                if debug:
                    print("❄️ gold mention not in predicted mentions", coref, missed_span)

    self.utterances.append(parsed)
    self.gold_corefs.append(corefs)
    self.utterances_speaker.append(speaker)
    self.n_sents += sum(1 for _ in parsed.sents)