import pydash as _
import torch

# `parse_for_tokens`, `_merge_sentences_across_mention`, `get_entity_text` and
# `pad_batch_list` are module-local helpers assumed to be defined elsewhere in
# this package.


def _get_mention_splits(doc, mention, seek, span):
  # Locate the mention, searching from `seek`, and express its start as an
  # offset into the sentence delimited by `span`.
  mention_start_seek_offset = _.index_of(doc[seek:], mention)
  mention_start_sentence_offset = seek - span[0] + mention_start_seek_offset
  to_idx = mention_start_sentence_offset + len(mention)
  sentence = doc[span[0]:span[1]]
  # Return the tokenized left and right contexts (each includes the mention
  # itself) along with the document offset just past the mention.
  return ([parse_for_tokens(sentence[:mention_start_sentence_offset] + mention),
           parse_for_tokens(mention + sentence[to_idx:])],
          span[0] + to_idx)

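# Illustrative trace (a sketch, not from the original source; the values below
# assume `parse_for_tokens` splits roughly on whitespace):
#
#   doc = 'Paris is nice. I love Paris in spring.'
#   _get_mention_splits(doc, 'Paris', seek=15, span=(15, 38))
#   # -> ([tokens('I love Paris'), tokens('Paris in spring.')], 27)
#
# The left split ends with the mention, the right split starts with it, and
# 27 is the document offset just past the mention.
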
def get_mention_sentence(page_content, sentence_spans, mention_info, lim=None):
  # Merge any sentences the mention straddles, then tokenize the result,
  # optionally truncated to `lim` tokens.
  mention_len = len(mention_info['mention'])
  sentence_span = _merge_sentences_across_mention(sentence_spans,
                                                  mention_info['offset'],
                                                  mention_len)
  return parse_for_tokens(page_content[sentence_span[0]:sentence_span[1]])[:lim]

def get_mention_sentence_splits(page_content, sentence_spans, mention_info,
                                lim=None):
  mention_len = len(mention_info['mention'])
  sentence_span = _merge_sentences_across_mention(sentence_spans,
                                                  mention_info['offset'],
                                                  mention_len)
  sentence = page_content[sentence_span[0]:sentence_span[1]]
  mention_index = sentence.index(mention_info['mention'])
  if lim is not None:
    # Keep at most `lim` tokens in total: the tail of the left context and
    # the head of the right context. Note `-lim // 2` floors toward negative
    # infinity, so for odd `lim` the left split keeps the extra token.
    return [parse_for_tokens(sentence[:mention_index + mention_len])[-lim // 2:],
            parse_for_tokens(sentence[mention_index:])[:lim // 2]]
  else:
    return [parse_for_tokens(sentence[:mention_index + mention_len]),
            parse_for_tokens(sentence[mention_index:])]

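# Illustrative call (hypothetical data; assumes the helpers above are in
# scope). Both splits contain the mention, so a downstream encoder can read
# up to it from the left and from it to the right:
#
#   mention_info = {'mention': 'Paris', 'offset': 22}
#   get_mention_sentence_splits(page_content, sentence_spans, mention_info,
#                               lim=10)
#   # -> [last 5 tokens ending with 'Paris', first 5 tokens starting with it]
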
def _get_entity_tokens(self, num_entities):
  # Map each token to its vocabulary index, falling back to '<UNK>'.
  mapper = lambda token: self.lookups.token_idx_lookup.get(
      token, self.lookups.token_idx_lookup['<UNK>'])
  # Tokenize every entity's text, keyed by the entity's label.
  entity_indexed_tokens = {
      self.lookups.entity_labels[entity_id]: _.map_(parse_for_tokens(text),
                                                    mapper)
      for entity_id, text in get_entity_text().items()
      if entity_id in self.lookups.entity_labels
  }
  # Entities without text get the placeholder token index [1].
  entity_indexed_tokens_list = [
      entity_indexed_tokens.get(i, [1]) for i in range(num_entities)
  ]
  return torch.tensor(pad_batch_list(0, entity_indexed_tokens_list),
                      device=self.device)

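# Assumed contract of `pad_batch_list` (an illustration, not from the original
# source): right-pad every row with the given pad value so the batch is
# rectangular, as `torch.tensor` requires, e.g.
#
#   pad_batch_list(0, [[2, 3, 4], [5]])  # -> [[2, 3, 4], [5, 0, 0]]
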
def _get_mention_sentence(doc, mention, seek, span):
  # Same offset arithmetic as `_get_mention_splits`, but return the whole
  # tokenized sentence instead of left/right splits.
  mention_start_seek_offset = _.index_of(doc[seek:], mention)
  mention_start_sentence_offset = seek - span[0] + mention_start_seek_offset
  to_idx = mention_start_sentence_offset + len(mention)
  sentence = doc[span[0]:span[1]]
  return (parse_for_tokens(sentence), span[0] + to_idx)
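

if __name__ == '__main__':
  # Minimal smoke test (an illustrative sketch, not part of the original
  # module; it assumes `parse_for_tokens` is available at import time). The
  # toy doc has two sentences; the second occupies span [15, 38).
  doc = 'Paris is nice. I love Paris in spring.'
  tokens, end = _get_mention_sentence(doc, 'Paris', seek=15, span=(15, 38))
  print(tokens)  # tokens of 'I love Paris in spring.'
  print(end)     # 27: document offset just past the mention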