    def parse(self, document, text):
        '''
        Transform rule-based tokenizer output to match CoreNLP's default format
        :param document: the parent Document object
        :param text: raw text to split into sentences and tokenize
        :return: a generator of CoreNLP-style sentence dictionaries
        '''
        text = self.to_unicode(text)

        offset, position = 0, 0
        sentences = self.sent_boundary.apply(text)

        for sent, sent_offset in sentences:
            parts = defaultdict(list)
            tokens = self.tokenizer.apply(sent)
            if not tokens:
                continue

            parts['words'], parts['char_offsets'] = list(zip(*tokens))
            parts['abs_char_offsets'] = [
                idx + offset for idx in parts['char_offsets']
            ]
            parts['lemmas'] = []
            parts['pos_tags'] = []
            parts['ner_tags'] = []
            parts['dep_parents'] = []
            parts['dep_labels'] = []
            parts['position'] = position

            position += 1
            offset += len(sent)

            # Link the sentence to its parent document object
            parts['document'] = document
            parts['text'] = sent

            # Add null entity array (matching null for CoreNLP)
            parts['entity_cids'] = ['O' for _ in parts['words']]
            parts['entity_types'] = ['O' for _ in parts['words']]

            # Assign the stable id as document's stable id plus absolute
            # character offset
            abs_sent_offset = parts['abs_char_offsets'][0]
            abs_sent_offset_end = (abs_sent_offset + parts['char_offsets'][-1]
                                   + len(parts['words'][-1]))
            if document:
                parts['stable_id'] = construct_stable_id(
                    document, 'sentence', abs_sent_offset, abs_sent_offset_end)

            yield parts
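
A minimal sketch of the end-offset arithmetic used for the stable id above, with plain lists standing in for the parser state (the values are illustrative; no parser class is assumed):

    words = ['Hello', 'world', '.']
    char_offsets = [0, 6, 11]    # sentence-relative token starts
    abs_sent_offset = 20         # absolute offset where the sentence begins
    # The sentence ends at its absolute start, plus the last token's
    # relative start, plus that token's length.
    abs_sent_offset_end = abs_sent_offset + char_offsets[-1] + len(words[-1])
    print(abs_sent_offset_end)   # 20 + 11 + 1 = 32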
Example #2
    def parse(self, document, contents):
        i = 0
        for text in contents.split(self.delim):
            if not len(text.strip()):
                continue
            words = text.split()
            char_offsets = [0] + list(np.cumsum([len(x) + 1
                                                 for x in words]))[:-1]
            text = ' '.join(words)
            stable_id = construct_stable_id(document, 'phrase', i, i)
            yield {
                'text': text,
                'words': words,
                'char_offsets': char_offsets,
                'stable_id': stable_id
            }
            i += 1
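
A self-contained sketch of the np.cumsum offset trick above: each word's start offset is the total length of the preceding words plus one joining space per boundary (the sample words and output are illustrative):

    import numpy as np

    words = 'a quick brown fox'.split()
    # len(x) + 1 accounts for the single space that joins adjacent words.
    char_offsets = [0] + list(np.cumsum([len(x) + 1 for x in words]))[:-1]
    print([int(c) for c in char_offsets])  # [0, 2, 8, 14]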
Example #3
    def parse(self, document, text):
        '''
        Transform spaCy output to match CoreNLP's default format
        :param document: the parent Document object
        :param text: raw document text to process with spaCy
        :return: a generator of CoreNLP-style sentence dictionaries
        '''
        text = self.to_unicode(text)

        doc = self.model.tokenizer(text)
        for proc in self.pipeline:
            proc(doc)
        assert doc.is_parsed

        position = 0
        for sent in doc.sents:
            parts = defaultdict(list)
            text = sent.text

            for i, token in enumerate(sent):
                parts['words'].append(str(token))
                parts['lemmas'].append(token.lemma_)
                parts['pos_tags'].append(token.tag_)
                parts['ner_tags'].append(
                    token.ent_type_ if token.ent_type_ else 'O')
                parts['char_offsets'].append(token.idx)
                parts['abs_char_offsets'].append(token.idx)
                head_idx = (0 if token.head is token
                            else token.head.i - sent[0].i + 1)
                parts['dep_parents'].append(head_idx)
                parts['dep_labels'].append(token.dep_)

            # make char_offsets relative to start of sentence
            parts['char_offsets'] = [
                p - parts['char_offsets'][0] for p in parts['char_offsets']
            ]
            parts['position'] = position

            # Link the sentence to its parent document object
            parts['document'] = document
            parts['text'] = text

            # Add null entity array (matching null for CoreNLP)
            parts['entity_cids'] = ['O' for _ in parts['words']]
            parts['entity_types'] = ['O' for _ in parts['words']]

            # Assign the stable id as document's stable id plus absolute
            # character offset
            abs_sent_offset = parts['abs_char_offsets'][0]
            abs_sent_offset_end = (abs_sent_offset + parts['char_offsets'][-1]
                                   + len(parts['words'][-1]))
            if document:
                parts['stable_id'] = construct_stable_id(
                    document, 'sentence', abs_sent_offset, abs_sent_offset_end)

            position += 1

            yield parts
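
A standalone sketch of the dep_parents convention used above: head indices are 1-based positions within the sentence, with 0 reserved for the sentence root (assumes spaCy and the en_core_web_sm model are installed; the sample sentence is illustrative):

    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u'The cat sat on the mat.')
    sent = next(doc.sents)
    for token in sent:
        # 0 marks the root; otherwise point at the head's 1-based
        # position within this sentence.
        head_idx = 0 if token.head is token else token.head.i - sent[0].i + 1
        print(token.text, token.tag_, token.dep_, head_idx)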
Example #4
        def parse_node(node, table_info=None, figure_info=None):
            if node.tag is etree.Comment:
                return
            if self.blacklist and node.tag in self.blacklist:
                return

            self.figure_idx = figure_info.enter_figure(node, self.figure_idx)

            if self.tabular:
                self.table_idx = table_info.enter_tabular(node, self.table_idx)

            # flattens children of node that are in the 'flatten' list
            if self.flatten:
                self._flatten(node)

            for field in ['text', 'tail']:
                text = getattr(node, field)
                if text is not None:
                    if self.strip:
                        text = text.strip()
                    if len(text):
                        for (rgx, replace) in self.replacements:
                            text = rgx.sub(replace, text)
                        self.contents += text
                        self.contents += self.delim
                        block_lengths.append(len(text) + len(self.delim))

                        for parts in self.lingual_parse(document, text):
                            (_, _, _,
                             char_end) = split_stable_id(parts['stable_id'])
                            try:
                                parts['document'] = document
                                parts['phrase_num'] = self.phrase_num
                                abs_phrase_offset_end = (
                                    self.abs_phrase_offset +
                                    parts['char_offsets'][-1] +
                                    len(parts['words'][-1]))
                                parts['stable_id'] = construct_stable_id(
                                    document, 'phrase', self.abs_phrase_offset,
                                    abs_phrase_offset_end)
                                self.abs_phrase_offset = abs_phrase_offset_end
                                if self.structural:
                                    context_node = (node.getparent()
                                                    if field == 'tail' else node)
                                    parts['xpath'] = tree.getpath(context_node)
                                    parts['html_tag'] = context_node.tag
                                    parts['html_attrs'] = [
                                        '='.join(x) for x in list(
                                            context_node.attrib.items())
                                    ]
                                if self.tabular:
                                    parent = table_info.parent
                                    parts = table_info.apply_tabular(
                                        parts, parent, self.position)
                                yield Phrase(**parts)
                                self.position += 1
                                self.phrase_num += 1
                            except Exception as e:
                                # This should never happen
                                logger.exception(str(e))

            for child in node:
                if child.tag == 'table':
                    yield from parse_node(
                        child, TableInfo(document=table_info.document),
                        figure_info)
                elif child.tag == 'img':
                    yield from parse_node(
                        child, table_info,
                        FigureInfo(document=figure_info.document))
                else:
                    yield from parse_node(child, table_info, figure_info)

            if self.tabular:
                table_info.exit_tabular(node)

            figure_info.exit_figure(node)
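
A self-contained sketch of the text/tail traversal that parse_node relies on: lxml keeps an element's leading text in .text and the text following its closing tag in .tail, which is why both fields are visited for every node (the HTML snippet is illustrative):

    from lxml import etree

    root = etree.fromstring('<div>a<b>b</b>c</div>')

    def walk(node):
        # Visit the node's own text, then the text trailing its close tag.
        for field in ('text', 'tail'):
            text = getattr(node, field)
            if text and text.strip():
                yield (node.tag, field, text)
        for child in node:
            yield from walk(child)

    print(list(walk(root)))
    # [('div', 'text', 'a'), ('b', 'text', 'b'), ('b', 'tail', 'c')]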