def parse(self, document, text):
    '''
    Parse text with the rule-based sentence boundary detector and
    tokenizer, emitting records that match CoreNLP's default format.

    :param document: the parent Document object
    :param text: the raw text to parse
    :return: a generator of sentence dicts
    '''
    text = self.to_unicode(text)
    offset, position = 0, 0
    sentences = self.sent_boundary.apply(text)
    for sent, sent_offset in sentences:
        parts = defaultdict(list)
        tokens = self.tokenizer.apply(sent)
        if not tokens:
            continue
        parts['words'], parts['char_offsets'] = list(zip(*tokens))
        parts['abs_char_offsets'] = [
            idx + offset for idx in parts['char_offsets']
        ]
        parts['lemmas'] = []
        parts['pos_tags'] = []
        parts['ner_tags'] = []
        parts['dep_parents'] = []
        parts['dep_labels'] = []
        parts['position'] = position
        position += 1
        offset += len(sent)

        # Link the sentence to its parent document object
        parts['document'] = document
        parts['text'] = sent

        # Add null entity array (matching null for CoreNLP)
        parts['entity_cids'] = ['O' for _ in parts['words']]
        parts['entity_types'] = ['O' for _ in parts['words']]

        # Assign the stable id as document's stable id plus absolute
        # character offset
        abs_sent_offset = parts['abs_char_offsets'][0]
        abs_sent_offset_end = (abs_sent_offset + parts['char_offsets'][-1]
                               + len(parts['words'][-1]))
        if document:
            parts['stable_id'] = construct_stable_id(
                document, 'sentence', abs_sent_offset, abs_sent_offset_end)
        yield parts
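# A minimal sketch of the contract parse() assumes of its collaborators:
# sent_boundary.apply() yields (sentence, absolute char offset) pairs, and
# tokenizer.apply() returns a list (not a generator, since parse() truth-tests
# it) of (word, sentence-relative char offset) pairs. The two classes below
# are hypothetical stand-ins for illustration, not the project's actual
# implementations.
import re


class _DemoSentenceBoundary(object):
    def apply(self, text):
        # Split on sentence-final punctuation; yield (sentence, offset).
        for m in re.finditer(r'\S[^.!?]*[.!?]?', text):
            yield m.group().strip(), m.start()


class _DemoTokenizer(object):
    def apply(self, sent):
        # Whitespace tokenization; return a list of (word, offset) pairs.
        return [(m.group(), m.start()) for m in re.finditer(r'\S+', sent)]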
def parse(self, document, contents):
    '''
    Split contents on the delimiter and whitespace-tokenize each
    resulting phrase, yielding one dict per non-empty phrase.
    '''
    i = 0
    for text in contents.split(self.delim):
        if not len(text.strip()):
            continue
        words = text.split()
        # Start offset of each word in the space-rejoined text: running
        # sum of (word length + 1), with 0 prepended and the final total
        # dropped.
        char_offsets = [0] + list(np.cumsum([len(x) + 1 for x in words]))[:-1]
        text = ' '.join(words)
        stable_id = construct_stable_id(document, 'phrase', i, i)
        yield {
            'text': text,
            'words': words,
            'char_offsets': char_offsets,
            'stable_id': stable_id,
        }
        i += 1
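# A worked example of the offset arithmetic above. The function name is
# hypothetical and exists only to illustrate the cumsum trick.
def _demo_char_offsets():
    import numpy as np
    words = ['a', 'bb', 'ccc']
    # len+1 per word is [2, 3, 4]; cumsum gives [2, 5, 9]; dropping the
    # last and prepending 0 yields each word's start in ' '.join(words).
    char_offsets = [0] + list(np.cumsum([len(x) + 1 for x in words]))[:-1]
    assert char_offsets == [0, 2, 5]
    assert ' '.join(words)[2:4] == 'bb'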
def parse(self, document, text):
    '''
    Transform spaCy output to match CoreNLP's default format

    :param document: the parent Document object
    :param text: the raw text to parse
    :return: a generator of sentence dicts
    '''
    text = self.to_unicode(text)
    doc = self.model.tokenizer(text)
    for proc in self.pipeline:
        proc(doc)
    assert doc.is_parsed

    position = 0
    for sent in doc.sents:
        parts = defaultdict(list)
        text = sent.text
        for i, token in enumerate(sent):
            parts['words'].append(str(token))
            parts['lemmas'].append(token.lemma_)
            parts['pos_tags'].append(token.tag_)
            parts['ner_tags'].append(
                token.ent_type_ if token.ent_type_ else 'O')
            parts['char_offsets'].append(token.idx)
            parts['abs_char_offsets'].append(token.idx)
            # 0 marks the sentence root; otherwise a 1-based index into
            # the sentence, matching CoreNLP's dependency convention
            head_idx = (0 if token.head is token
                        else token.head.i - sent[0].i + 1)
            parts['dep_parents'].append(head_idx)
            parts['dep_labels'].append(token.dep_)

        # Make char_offsets relative to start of sentence
        parts['char_offsets'] = [
            p - parts['char_offsets'][0] for p in parts['char_offsets']
        ]
        parts['position'] = position

        # Link the sentence to its parent document object
        parts['document'] = document
        parts['text'] = text

        # Add null entity array (matching null for CoreNLP)
        parts['entity_cids'] = ['O' for _ in parts['words']]
        parts['entity_types'] = ['O' for _ in parts['words']]

        # Assign the stable id as document's stable id plus absolute
        # character offset
        abs_sent_offset = parts['abs_char_offsets'][0]
        abs_sent_offset_end = (abs_sent_offset + parts['char_offsets'][-1]
                               + len(parts['words'][-1]))
        if document:
            parts['stable_id'] = construct_stable_id(
                document, 'sentence', abs_sent_offset, abs_sent_offset_end)
        position += 1
        yield parts
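# A hedged usage sketch of the spaCy attributes parse() reads, assuming the
# en_core_web_sm model is installed; the sample sentence and print format
# are illustrative only.
def _demo_spacy_attributes():
    import spacy
    nlp = spacy.load('en_core_web_sm')
    doc = nlp('Barack Obama visited Berlin. He gave a speech.')
    for sent in doc.sents:
        for token in sent:
            # Same head-index convention as parse() above: 0 for the
            # sentence root, else a 1-based in-sentence parent index.
            head_idx = (0 if token.head is token
                        else token.head.i - sent[0].i + 1)
            print(str(token), token.tag_, token.ent_type_ or 'O',
                  head_idx, token.dep_)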
def parse_node(node, table_info=None, figure_info=None):
    if node.tag is etree.Comment:
        return
    if self.blacklist and node.tag in self.blacklist:
        return

    self.figure_idx = figure_info.enter_figure(node, self.figure_idx)
    if self.tabular:
        self.table_idx = table_info.enter_tabular(node, self.table_idx)

    # Flatten children of node that are in the 'flatten' list
    if self.flatten:
        self._flatten(node)

    for field in ['text', 'tail']:
        text = getattr(node, field)
        if text is not None:
            if self.strip:
                text = text.strip()
            if len(text):
                for (rgx, replace) in self.replacements:
                    text = rgx.sub(replace, text)
                self.contents += text
                self.contents += self.delim
                block_lengths.append(len(text) + len(self.delim))

                for parts in self.lingual_parse(document, text):
                    (_, _, _, char_end) = split_stable_id(parts['stable_id'])
                    try:
                        parts['document'] = document
                        parts['phrase_num'] = self.phrase_num
                        abs_phrase_offset_end = (
                            self.abs_phrase_offset
                            + parts['char_offsets'][-1]
                            + len(parts['words'][-1]))
                        parts['stable_id'] = construct_stable_id(
                            document, 'phrase', self.abs_phrase_offset,
                            abs_phrase_offset_end)
                        self.abs_phrase_offset = abs_phrase_offset_end
                        if self.structural:
                            context_node = (node.getparent()
                                            if field == 'tail' else node)
                            parts['xpath'] = tree.getpath(context_node)
                            parts['html_tag'] = context_node.tag
                            parts['html_attrs'] = [
                                '='.join(x)
                                for x in list(context_node.attrib.items())
                            ]
                        if self.tabular:
                            parent = table_info.parent
                            parts = table_info.apply_tabular(
                                parts, parent, self.position)
                        yield Phrase(**parts)
                        self.position += 1
                        self.phrase_num += 1
                    except Exception as e:
                        # This should never happen
                        logger.exception(str(e))

    for child in node:
        if child.tag == 'table':
            yield from parse_node(
                child, TableInfo(document=table_info.document), figure_info)
        elif child.tag == 'img':
            yield from parse_node(
                child, table_info, FigureInfo(document=figure_info.document))
        else:
            yield from parse_node(child, table_info, figure_info)

    if self.tabular:
        table_info.exit_tabular(node)
    figure_info.exit_figure(node)
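# A minimal sketch of the lxml text/tail semantics parse_node relies on:
# node.text is the text before the node's first child, while node.tail is
# the text after the node's closing tag and belongs to the parent's content.
# That is why 'tail' phrases take their structural context (xpath, html_tag,
# html_attrs) from node.getparent() rather than node itself.
def _demo_text_vs_tail():
    from lxml import etree
    root = etree.fromstring('<div>intro<b>bold</b>after</div>')
    assert root.text == 'intro'
    b = root[0]
    assert b.text == 'bold'
    assert b.tail == 'after'  # part of the parent <div>'s content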