Example #1
class RuTokenizer(Tokenizer):
    def __init__(self, **kwargs):
        """
        Args:
            annotators: set that can include pos, lemma, and ner.
        """

        self.annotators = copy.deepcopy(kwargs.get('annotators', set()))
        self.include_pos = {'pos'} & self.annotators
        self.include_lemma = {'lemma'} & self.annotators
        self.include_ner = {'ner'} & self.annotators
        self.morph = pymorphy2.MorphAnalyzer()
        self.wt = WhitespaceTokenizer()
        self.rt = RegexpTokenizer(r'\w+')

    def __call__(self, text):

        # We don't treat new lines as tokens.
        clean_text = text.replace('\n', ' ')

        # remove punctuation
        clean_text = ' '.join(self.rt.tokenize(clean_text))

        # split by whitespaces and get spans
        spans = list(self.wt.span_tokenize(clean_text))
        n = len(spans)

        data = []
        for i in range(n):
            start_idx, end_idx = spans[i]

            token = clean_text[start_idx:end_idx]

            start_ws = start_idx
            if i + 1 < n:
                end_ws = spans[i + 1][0]
            else:
                end_ws = start_idx + len(token)

            token_ws = clean_text[start_ws:end_ws]

            lemma, pos, ent_type = '', '', ''
            if self.include_pos or self.include_lemma:
                p = self.morph.parse(token)[0]
                if self.include_lemma:
                    lemma = p.normal_form
                if self.include_pos:
                    pos = p.tag.POS

            if self.include_ner:
                entities = Text(token, hint_language_code='ru').entities
                if len(entities):
                    ent_type = entities[0].tag

            data.append((token, token_ws, spans[i], pos, lemma, ent_type))

        return Tokens(data, self.annotators, opts={'non_ent': ''})
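For reference, the whitespace-span bookkeeping above (each token plus the whitespace that follows it) can be reproduced with nltk alone. A minimal standalone sketch with a made-up input string:

from nltk.tokenize import RegexpTokenizer, WhitespaceTokenizer

text = "Первый пример, без сложной разметки"

# drop punctuation the same way RuTokenizer does: keep only \w+ runs
clean_text = ' '.join(RegexpTokenizer(r'\w+').tokenize(text))

spans = list(WhitespaceTokenizer().span_tokenize(clean_text))
for i, (start, end) in enumerate(spans):
    token = clean_text[start:end]
    # token_ws extends to the start of the next token, so it keeps trailing whitespace
    end_ws = spans[i + 1][0] if i + 1 < len(spans) else end
    print(repr(token), repr(clean_text[start:end_ws]), (start, end))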
Example #2
class LingPipeParser(object):
    def __init__(self, config):
        self.clear()
        self.config = config

    def clear(self):
        self.tok_num = 0
        self.byte_idx = 0
        self.line_idx = 0
        self.word_tokenizer = WhitespaceTokenizer()

    def set(self, ner_dom):
        self.clear()
        ## nltk wants a unicode string, so decode it, and then we will
        ## re-encode it to carefully recover the byte offsets.  We
        ## must take care not to use any nltk components that insert
        ## new whitespace, such as
        ## nltk.tokenize.treebank.TreebankTokenizer
        self.ner_dom = ner_dom
        self.attributes = []
        self.relations = []

    def sentences(self):
        '''
        Iterate over <s> XML-like tags and tokenize with nltk
        '''
        for sentence_id, node in enumerate(self.ner_dom.childNodes):
            ## increment the char index with any text before the <s>
            ## tag.  Crucial assumption here is that the LingPipe XML
            ## tags are inserted into the original byte array without
            ## modifying the portions that are not inside the
            ## LingPipe-added tags themselves.
            if node.nodeType == node.TEXT_NODE:
                ## we expect to only see TEXT_NODE instances with whitespace
                assert only_whitespace.match(node.data), repr(node.data)

                ## must convert back to utf-8 to have expected byte offsets
                self.byte_idx += len(node.data.encode('utf-8'))

                ## count full lines, i.e. only those that end with a \n
                # 'True' here means keep the trailing newlines
                for line in node.data.splitlines(True):
                    if line.endswith('\n'):
                        self.line_idx += 1
            else:
                logger.debug('getting tokens for sentence_id=%d' % sentence_id)
                more_sentence_remains = True
                while more_sentence_remains:
                    ## always a sentence
                    sent = Sentence()

                    ## this "node" came from the for loop above, and its
                    ## childNodes list might have been popped by a
                    ## previous pass through this while loop
                    tokens = iter(self.tokens(node))

                    while 1:
                        try:
                            tok = next(tokens)
                            sent.tokens.append(tok)
                            #logger.debug('got token: %r  %d %d' % (tok.token, tok.mention_id, tok.sentence_pos))

                        except StopIteration:
                            yield sent
                            more_sentence_remains = False
                            break

    def _make_token(self, start, end):
        '''
        Instantiates a Token from self._input_string[start:end]
        '''
        ## all thrift strings must be encoded first
        tok_string = self._input_string[start:end].encode('utf-8')
        if only_whitespace.match(tok_string):
            ## drop any tokens with only whitespace
            return None
        tok = Token()
        tok.token = tok_string
        tok.token_num = self.tok_num
        if 'BYTES' in self.config['offset_types']:
            tok.offsets[OffsetType.BYTES] = Offset(
                type =  OffsetType.BYTES,
                first=self.byte_idx + len(self._input_string[:start].encode('utf-8')),
                length=len(tok_string),
                value=self.config['offset_debugging'] and tok_string or None,
                )
        if 'LINES' in self.config['offset_types']:
            tok.offsets[OffsetType.LINES] = Offset(
                type =  OffsetType.LINES,
                first=self.line_idx,
                length=1,
                value=self.config['offset_debugging'] and tok_string or None,
                )
        self.tok_num += 1
        ## keep track of position within a sentence
        tok.sentence_pos = self.sent_pos
        self.sent_pos += 1
        return tok

    def tokens(self, sentence_dom):
        '''
        Tokenize all the words and preserve NER labels from ENAMEX tags
        '''
        ## keep track of sentence position, which is reset for each
        ## sentence, and used above in _make_token
        self.sent_pos = 0
    
        ## keep track of mention_id, so we can distinguish adjacent
        ## multi-token mentions within the same coref chain
        mention_id = 0

        while len(sentence_dom.childNodes) > 0:
            ## shrink the sentence_dom's child nodes.  In v0_2_0 this
            ## was required to cope with HitMaxi16.  Now it is just to
            ## save memory.
            node = sentence_dom.childNodes.pop(0)

            if node.nodeType == node.TEXT_NODE:
                ## process portion before an ENAMEX tag
                for line in node.data.splitlines(True):
                    self._input_string = line
                    for start, end in self.word_tokenizer.span_tokenize(line):
                        tok = self._make_token(start, end)
                        if tok:
                            yield tok

                    if line.endswith('\n'):
                        ## maintain the index to the current line
                        self.line_idx += 1

                    ## increment the byte index past the 'before' portion
                    self.byte_idx += len(line.encode('utf-8'))

            else:
                ## process text inside an ENAMEX tag
                assert node.nodeName == 'ENAMEX', node.nodeName
                chain_id = node.attributes.get('ID').value
                entity_type = node.attributes.get('TYPE').value
                for node in node.childNodes:
                    assert node.nodeType == node.TEXT_NODE, node.nodeType
                    for line in node.data.splitlines(True):
                        self._input_string = line
                        for start, end in self.word_tokenizer.span_tokenize(line):
                            tok = self._make_token(start, end)
                            if tok:
                                if entity_type in _PRONOUNS:
                                    tok.mention_type = MentionType.PRO
                                    tok.entity_type = _ENTITY_TYPES[entity_type]
                                    
                                    ## create an attribute
                                    attr = Attribute(
                                        attribute_type=AttributeType.PER_GENDER,
                                        value=str(_PRONOUNS[entity_type])
                                        )
                                    self.attributes.append(attr)

                                else:
                                    ## regular entity_type
                                    tok.mention_type = MentionType.NAME
                                    tok.entity_type = _ENTITY_TYPES[entity_type]

                                tok.equiv_id = int(chain_id)
                                tok.mention_id = mention_id
                                yield tok

                        if line.endswith('\n'):
                            ## maintain the index to the current line
                            self.line_idx += 1

                        ## increment the byte index past the 'before' portion
                        self.byte_idx += len(line.encode('utf-8'))

                ## increment mention_id within this sentence
                mention_id += 1
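The byte-offset arithmetic in _make_token (re-encode the prefix of the unicode line to count bytes) can be checked in isolation. A small sketch, assuming only nltk and a made-up line containing non-ASCII characters:

from nltk.tokenize import WhitespaceTokenizer

line = u"naïve café on the Führerstraße"
byte_idx = 0  # bytes already consumed before this line, as LingPipeParser tracks it

for start, end in WhitespaceTokenizer().span_tokenize(line):
    tok_string = line[start:end].encode('utf-8')
    # byte offset = bytes before this line + encoded length of the line up to the token start
    first = byte_idx + len(line[:start].encode('utf-8'))
    print(tok_string, first, len(tok_string))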
Example #3
class nltk_tokenizer(IncrementalTransform):
    '''
    a streamcorpus_pipeline IncrementalTransform that converts a chunk into a new
    chunk with Sentence objects generated using NLTK tokenizers
    '''
    config_name = 'nltk_tokenizer'
    tagger_id = 'nltk_tokenizer'

    def __init__(self, *args, **kwargs):
        super(nltk_tokenizer, self).__init__(*args, **kwargs)
        self.sentence_tokenizer = PunktSentenceTokenizer()
        self.word_tokenizer = WhitespaceTokenizer()  #PunktWordTokenizer()

    def _sentences(self, clean_visible):
        'generate strings identified as sentences'
        previous_end = 0
        clean_visible = clean_visible.decode('utf8')
        assert isinstance(clean_visible, unicode)
        for start, end in self.sentence_tokenizer.span_tokenize(clean_visible):
            ## no need to check start, because the first byte of text
            ## is always first byte of first sentence, and we will
            ## have already made the previous sentence longer on the
            ## end if there was an overlap.
            if start < previous_end:
                start = previous_end
                if start > end:
                    ## skip this sentence... because it was eaten by
                    ## an earlier sentence with a label
                    continue
            try:
                label = self.label_index.find_le(end)
            except ValueError:
                label = None
            if label:
                off = label.offsets[OffsetType.BYTES]
                end = max(off.first + off.length, end)
            previous_end = end
            sent_str = clean_visible[start:end]
            yield start, end, sent_str

    def make_label_index(self, stream_item):
        'make a sortedcollection on body.labels'
        labels = stream_item.body.labels.get(self.config.get('annotator_id'))
        if not labels:
            labels = []

        self.label_index = SortedCollection(
            labels, key=lambda label: label.offsets[OffsetType.BYTES].first)

    def make_sentences(self, stream_item):
        'assemble Sentence and Token objects'
        self.make_label_index(stream_item)
        sentences = []
        token_num = 0
        new_mention_id = 0
        for sent_start, sent_end, sent_str in self._sentences(
                stream_item.body.clean_visible):
            assert isinstance(sent_str, unicode)
            sent = Sentence()
            sentence_pos = 0
            for start, end in self.word_tokenizer.span_tokenize(sent_str):
                try:
                    token_str = sent_str[start:end].encode('utf8')
                except Exception as exc:
                    logger.critical("died on sent_str[%d:%d].encode('utf8')",
                                    start,
                                    end,
                                    exc_info=True)
                    sys.exit('failed to cope with %r in %r' %
                             (sent_str[start:end], sent_str))
                tok = Token(
                    token_num=token_num,
                    token=token_str,
                    sentence_pos=sentence_pos,
                )
                tok.offsets[OffsetType.BYTES] = Offset(
                    type=OffsetType.BYTES,
                    first=sent_start + start,
                    length=end - start,
                )
                ## whitespace tokenizer will never get a token
                ## boundary in the middle of an 'author' label
                try:
                    #logger.debug('searching for %d in %r', sent_start + start, self.label_index._keys)
                    label = self.label_index.find_le(sent_start + start)
                except ValueError:
                    label = None
                if label:
                    off = label.offsets[OffsetType.BYTES]
                    if off.first + off.length > sent_start + start:
                        logger.info('overlapping label: %r' %
                                    label.target.target_id)
                        ## overlaps
                        streamcorpus.add_annotation(tok, label)
                        assert label.annotator.annotator_id in tok.labels

                        logger.info('adding label to tok: %r has %r',
                                    tok.token, label.target.target_id)

                        if label in self.label_to_mention_id:
                            mention_id = self.label_to_mention_id[label]
                        else:
                            mention_id = new_mention_id
                            new_mention_id += 1
                            self.label_to_mention_id[label] = mention_id

                        tok.mention_id = mention_id

                token_num += 1
                sentence_pos += 1
                sent.tokens.append(tok)
            sentences.append(sent)
        return sentences
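The sentence/token nesting used by make_sentences can be tried without the streamcorpus machinery. A minimal sketch that only assumes nltk (offsets here are character offsets, not bytes):

from nltk.tokenize import PunktSentenceTokenizer, WhitespaceTokenizer

clean_visible = u"First sentence here. A second one follows!"
sent_tok = PunktSentenceTokenizer()
word_tok = WhitespaceTokenizer()

for sent_start, sent_end in sent_tok.span_tokenize(clean_visible):
    sent_str = clean_visible[sent_start:sent_end]
    for start, end in word_tok.span_tokenize(sent_str):
        # sent_start + start is the token's offset within the whole document
        print(sent_str[start:end], sent_start + start, end - start)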
Example #4
class LingPipeParser(object):
    def __init__(self, config):
        self.config = config
        self.clear()

    def clear(self):
        self.tok_num = 0
        self.byte_idx = 0
        self.line_idx = 0
        self.word_tokenizer = WhitespaceTokenizer()

    def set(self, ner_dom):
        self.clear()
        ## nltk wants a unicode string, so decode it, and then we will
        ## re-encode it to carefully recover the byte offsets.  We
        ## must take care not to use any nltk components that insert
        ## new whitespace, such as
        ## nltk.tokenize.treebank.TreebankTokenizer
        self.ner_dom = ner_dom
        self.attributes = []
        self.relations = []

    def sentences(self):
        '''
        Iterate over <s> XML-like tags and tokenize with nltk
        '''
        for sentence_id, node in enumerate(self.ner_dom.childNodes):
            ## increment the char index with any text before the <s>
            ## tag.  Crucial assumption here is that the LingPipe XML
            ## tags are inserted into the original byte array without
            ## modifying the portions that are not inside the
            ## LingPipe-added tags themselves.
            if node.nodeType == node.TEXT_NODE:
                ## we expect to only see TEXT_NODE instances with whitespace
                assert only_whitespace.match(node.data), repr(node.data)

                ## must convert back to utf-8 to have expected byte offsets
                self.byte_idx += len(node.data.encode('utf-8'))

                ## count full lines, i.e. only those that end with a \n
                # 'True' here means keep the trailing newlines
                for line in node.data.splitlines(True):
                    if line.endswith('\n'):
                        self.line_idx += 1
            else:
                logger.debug('getting tokens for sentence_id=%d' % sentence_id)
                more_sentence_remains = True
                while more_sentence_remains:
                    ## always a sentence
                    sent = Sentence()

                    ## this "node" came from the for loop above, and its
                    ## childNodes list might have been popped by a
                    ## previous pass through this while loop
                    tokens = iter(self.tokens(node))

                    while 1:
                        try:
                            tok = next(tokens)
                            sent.tokens.append(tok)
                            #logger.debug('got token: %r  %d %d' % (tok.token, tok.mention_id, tok.sentence_pos))

                        except StopIteration:
                            yield sent
                            more_sentence_remains = False
                            break

    def _make_token(self, start, end):
        '''
        Instantiates a Token from self._input_string[start:end]
        '''
        ## all thrift strings must be encoded first
        tok_string = self._input_string[start:end].encode('utf-8')
        if only_whitespace.match(tok_string):
            ## drop any tokens with only whitespace
            return None
        tok = Token()
        tok.token = tok_string
        tok.token_num = self.tok_num
        if 'BYTES' in self.config['offset_types']:
            tok.offsets[OffsetType.BYTES] = Offset(
                type=OffsetType.BYTES,
                first=self.byte_idx + len(self._input_string[:start].encode('utf-8')),
                length=len(tok_string),
                value=self.config['offset_debugging'] and tok_string or None,
            )
        if 'LINES' in self.config['offset_types']:
            tok.offsets[OffsetType.LINES] = Offset(
                type=OffsetType.LINES,
                first=self.line_idx,
                length=1,
                value=self.config['offset_debugging'] and tok_string or None,
            )
        self.tok_num += 1
        ## keep track of position within a sentence
        tok.sentence_pos = self.sent_pos
        self.sent_pos += 1
        return tok

    def tokens(self, sentence_dom):
        '''
        Tokenize all the words and preserve NER labels from ENAMEX tags
        '''
        ## keep track of sentence position, which is reset for each
        ## sentence, and used above in _make_token
        self.sent_pos = 0

        ## keep track of mention_id, so we can distinguish adjacent
        ## multi-token mentions within the same coref chain
        mention_id = 0

        while len(sentence_dom.childNodes) > 0:
            ## shrink the sentence_dom's child nodes.  In v0_2_0 this
            ## was required to cope with HitMaxi16.  Now it is just to
            ## save memory.
            node = sentence_dom.childNodes.pop(0)

            if node.nodeType == node.TEXT_NODE:
                ## process portion before an ENAMEX tag
                for line in node.data.splitlines(True):
                    self._input_string = line
                    for start, end in self.word_tokenizer.span_tokenize(line):
                        tok = self._make_token(start, end)
                        if tok:
                            yield tok

                    if line.endswith('\n'):
                        ## maintain the index to the current line
                        self.line_idx += 1

                    ## increment the byte index past the 'before' portion
                    self.byte_idx += len(line.encode('utf-8'))

            else:
                ## process text inside an ENAMEX tag
                assert node.nodeName == 'ENAMEX', node.nodeName
                chain_id = node.attributes.get('ID').value
                entity_type = node.attributes.get('TYPE').value
                for node in node.childNodes:
                    assert node.nodeType == node.TEXT_NODE, node.nodeType
                    for line in node.data.splitlines(True):
                        self._input_string = line
                        for start, end in self.word_tokenizer.span_tokenize(line):
                            tok = self._make_token(start, end)
                            if tok:
                                if entity_type in _PRONOUNS:
                                    tok.mention_type = MentionType.PRO
                                    tok.entity_type = _ENTITY_TYPES[entity_type]

                                    ## create an attribute
                                    attr = Attribute(
                                        attribute_type=AttributeType.PER_GENDER,
                                        value=str(_PRONOUNS[entity_type]))
                                    self.attributes.append(attr)

                                else:
                                    ## regular entity_type
                                    tok.mention_type = MentionType.NAME
                                    tok.entity_type = _ENTITY_TYPES[entity_type]

                                tok.equiv_id = int(chain_id)
                                tok.mention_id = mention_id
                                yield tok

                        if line.endswith('\n'):
                            ## maintain the index to the current line
                            self.line_idx += 1

                        ## increment the byte index past the 'before' portion
                        self.byte_idx += len(line.encode('utf-8'))

                ## increment mention_id within this sentence
                mention_id += 1
Example #5
random.seed(42)

# Load the training set
training_texts, training_spans = load_dataset(args.train_dir)

# Use the NLTK tokenizer
tokenizer = WhitespaceTokenizer()

# Get the tokenized posts and their labels
tokenized_posts, gold_labels = [], []

# Tokenize the texts and get the corresponding labels
for text, span in zip(training_texts, training_spans):
    # Tokenize post
    tokens_offsets = tokenizer.span_tokenize(text)
    gold_offset_chars = set(span)

    # Determine label for each token
    tokenized_post, post_labels = [], []
    for i, j in tokens_offsets:
        # check if this token label is toxic
        toxic_label = 0
        for k in range(i, j):
            if k in gold_offset_chars:
                toxic_label = 1
                break

        # remove punctuation from the token
        tokenized_post.append(text[i:j].translate(
            str.maketrans('', '', string.punctuation)))
        post_labels.append(toxic_label)

    tokenized_posts.append(tokenized_post)
    gold_labels.append(post_labels)
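The labelling step above can be exercised on a toy post. A minimal sketch with a made-up text and a made-up gold character span:

import string
from nltk.tokenize import WhitespaceTokenizer

text = "you absolute idiot, stop it"
gold_offset_chars = set(range(13, 18))  # character offsets of the toxic word "idiot"

tokens, labels = [], []
for i, j in WhitespaceTokenizer().span_tokenize(text):
    # a token is toxic if any of its character offsets falls in the gold span
    labels.append(int(any(k in gold_offset_chars for k in range(i, j))))
    tokens.append(text[i:j].translate(str.maketrans('', '', string.punctuation)))

print(list(zip(tokens, labels)))  # [('you', 0), ('absolute', 0), ('idiot', 1), ('stop', 0), ('it', 0)]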
Example #6
class nltk_tokenizer(IncrementalTransform):
    """
    a streamcorpus_pipeline IncrementalTransform that converts a chunk into a new
    chunk with Sentence objects generated using NLTK tokenizers
    """

    tagger_id = "nltk_tokenizer"

    def __init__(self, config):
        self.config = config
        self.sentence_tokenizer = PunktSentenceTokenizer()
        self.word_tokenizer = WhitespaceTokenizer()  # PunktWordTokenizer()

    def _sentences(self, clean_visible):
        "generate strings identified as sentences"
        previous_end = 0
        clean_visible = clean_visible.decode("utf8")
        assert isinstance(clean_visible, unicode)
        for start, end in self.sentence_tokenizer.span_tokenize(clean_visible):
            ## no need to check start, because the first byte of text
            ## is always first byte of first sentence, and we will
            ## have already made the previous sentence longer on the
            ## end if there was an overlap.
            if start < previous_end:
                start = previous_end
                if start > end:
                    ## skip this sentence... because it was eaten by
                    ## an earlier sentence with a label
                    continue
            try:
                label = self.label_index.find_le(end)
            except ValueError:
                label = None
            if label:
                off = label.offsets[OffsetType.BYTES]
                end = max(off.first + off.length, end)
            previous_end = end
            sent_str = clean_visible[start:end]
            yield start, end, sent_str

    def make_label_index(self, stream_item):
        "make a sortedcollection on body.labels"
        labels = stream_item.body.labels.get(self.config.get("annotator_id"))
        if not labels:
            labels = []

        self.label_index = SortedCollection(labels, key=lambda label: label.offsets[OffsetType.BYTES].first)

    def make_sentences(self, stream_item):
        "assemble Sentence and Token objects"
        self.make_label_index(stream_item)
        sentences = []
        token_num = 0
        new_mention_id = 0
        for sent_start, sent_end, sent_str in self._sentences(stream_item.body.clean_visible):
            assert isinstance(sent_str, unicode)
            sent = Sentence()
            sentence_pos = 0
            for start, end in self.word_tokenizer.span_tokenize(sent_str):
                try:
                    token_str = sent_str[start:end].encode("utf8")
                except Exception as exc:
                    logger.critical("died on sent_str[%d:%d].encode('utf8')", start, end, exc_info=True)
                    sys.exit("failed to cope with %r in %r" % (sent_str[start:end], sent_str))
                tok = Token(token_num=token_num, token=token_str, sentence_pos=sentence_pos)
                tok.offsets[OffsetType.BYTES] = Offset(
                    type=OffsetType.BYTES, first=sent_start + start, length=end - start
                )
                ## whitespace tokenizer will never get a token
                ## boundary in the middle of an 'author' label
                try:
                    # logger.debug('searching for %d in %r', sent_start + start, self.label_index._keys)
                    label = self.label_index.find_le(sent_start + start)
                except ValueError:
                    label = None
                if label:
                    off = label.offsets[OffsetType.BYTES]
                    if off.first + off.length > sent_start + start:
                        logger.info("overlapping label: %r" % label.target.target_id)
                        ## overlaps
                        streamcorpus.add_annotation(tok, label)
                        assert label.annotator.annotator_id in tok.labels

                        logger.info("adding label to tok: %r has %r", tok.token, label.target.target_id)

                        if label in self.label_to_mention_id:
                            mention_id = self.label_to_mention_id[label]
                        else:
                            mention_id = new_mention_id
                            new_mention_id += 1
                            self.label_to_mention_id[label] = mention_id

                        tok.mention_id = mention_id

                token_num += 1
                sentence_pos += 1
                sent.tokens.append(tok)
            sentences.append(sent)
        return sentences
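One detail in _sentences worth isolating is how a sentence is stretched so it fully covers a label that would otherwise be cut at a Punkt boundary. A rough sketch of that idea with a made-up label span, leaving out the byte arithmetic and the SortedCollection lookup:

from nltk.tokenize import PunktSentenceTokenizer

clean_visible = u"Dr. Smith wrote this. Another sentence."
label_first, label_length = 18, 10  # hypothetical label span crossing a sentence break

previous_end = 0
for start, end in PunktSentenceTokenizer().span_tokenize(clean_visible):
    start = max(start, previous_end)
    if start > end:
        continue  # this sentence was swallowed by the previous, extended one
    if label_first <= end:
        # stretch the sentence end so it covers the whole label
        end = max(label_first + label_length, end)
    previous_end = end
    print((start, end), repr(clean_visible[start:end]))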
Example #7
#outf.write("---BOS---\n")  # Beginning of sentence
sentence_re = r'''(?x)          # set flag to allow verbose regexps
    (?:[A-Z])(?:\.[A-Z])+\.?    # abbreviations, e.g. U.S.A.
  | \w+(?:-\w+)*                # words with optional internal hyphens
  | \$?\d+(?:\.\d+)?%?          # currency and percentages, e.g. $12.40, 82%
  | \.\.\.                      # ellipsis
  | [][.,;"'?():-_`]            # these are separate tokens
'''

# Remove punctuation
'''table = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P'))
sout = s.translate(table)'''

tokenizer = WhitespaceTokenizer()
tokenwords = WhitespaceTokenizer().tokenize(s)
t_spans = tokenizer.span_tokenize(s)
t_spans_l = []

#tokenwords2 = word_tokenize(s)
word_spans_ = []

for w in t_spans:
    t_spans_l.append(w)
#print(t_spans_l)
k = 0
for t in tokenwords:
    #index = tokenwords.index(t)
    #print(t[len(t)-1])
    w = t_spans_l[k]
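The manual bookkeeping with t_spans_l and the counter k can usually be avoided by zipping the two iterators, since tokenize and span_tokenize walk the string in the same order. A small sketch with a made-up string s:

from nltk.tokenize import WhitespaceTokenizer

s = "The U.S.A. price is $12.40 ... really?"
tokenizer = WhitespaceTokenizer()

for token, (start, end) in zip(tokenizer.tokenize(s), tokenizer.span_tokenize(s)):
    # each surface token is exactly the slice named by its span
    assert s[start:end] == token
    print(token, (start, end))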
Example #8
class ACEParser:
    def __init__(self):
        self.sent_tokenizer = PunktSentenceTokenizer()
        # self.word_tokenizer = RegexpTokenizer('\w+|\S+')
        self.word_tokenizer = WhitespaceTokenizer()
        self.root = None
        self.sentence_offsets = []
        self.df = pd.DataFrame(
            columns=["doc_id", "sentence", "tokens", "events", "entities"])

    def get_text(self, sgm_file):
        with open(sgm_file, "r", encoding="utf-8") as f:
            text = f.read()

        # Gets rid of lines with only tags
        text = re.sub(r"<(.|\s|\n)*?>", r"", text)
        sentence_offsets = list(self.sent_tokenizer.span_tokenize(text))
        sentences = []
        for offset in sentence_offsets:
            sentence_text = text[offset[0]:offset[1]]
            sentences.append(sentence_text)

        self.sentence_offsets = sentence_offsets
        self.text = text  # keep the raw text so get_sentences() can slice it later
        return text

    def create_tree(self, apf_file):
        with open(apf_file, "r", encoding="utf-8") as f:
            xml_text = f.read()

        root = etree.fromstring(xml_text)
        self.root = root

    def get_extents(self):
        extent_nodes = self.root.xpath("//extent/charseq")
        return [
            self.get_offset_tuple(extent_node) for extent_node in extent_nodes
        ]

    def get_offset_tuple(self, extent_node):
        return (int(extent_node.get("START")), int(extent_node.get("END")) + 1
                )  # +1 makes them exclusive

    def get_sentences(self):
        sentences = []
        for offset in self.sentence_offsets:
            sentence_text = self.text[offset[0]:offset[1]]
            sentences.append(sentence_text)

        return sentences

    def find_sentence_index(self, offset):

        for i, sent_offset in enumerate(self.sentence_offsets):
            if offset[0] >= sent_offset[0] and offset[1] <= sent_offset[1]:
                return i

    def offset_to_token(self, start, end, token_offsets, normalize=0):
        # normalize is making start and end relatable to token_offsets
        start -= normalize
        end -= normalize

        # TODO: change this to if end == offset[1]. In the case that end < offset[1] use startswith and extend token_offsets list
        for i, offset in enumerate(token_offsets):
            if end <= offset[1]:
                for j in range(i, -1, -1):
                    if start >= token_offsets[j][0]:
                        return j, i + 1  # Make it exclusive

        raise Exception(
            "Error while converting offset to token indexes. Start offset : %d , End offset : %d Norm : %d, Token offsets : %s"
            % (start, end, normalize, str(token_offsets)))

    def create_json_output(self, doc_text, filename):
        # doc_id = self.root.xpath("document")[0].get("DOCID")
        doc_id = filename
        event_nodes = self.root.xpath("//event")

        # TODO: We lose coreference information doing it this way. For now it is ok, but need to accommodate the other way too !!!
        event_mentions = []
        for event_node in event_nodes:
            event_type = event_node.get("TYPE")
            event_subtype = event_node.get("SUBTYPE")
            event_id = event_node.get("ID")
            event_mention_nodes = event_node.xpath("event_mention")
            for mention_node in event_mention_nodes:
                # You actually don't need these two for finding which sentence we are talking about.
                # Because we already made sure that all of our extents are covered by sentence offsets.
                # extent_node = mention.xpath("/extent/charseq")[0]
                # extent = get_offset_tuple(extent_node)

                trigger_offset = self.get_offset_tuple(
                    mention_node.xpath("anchor/charseq")[0])

                # find which sentence this belongs. Only need to do this once.
                sent_idx = self.find_sentence_index(trigger_offset)

                event_arguments = []
                arguments = mention_node.xpath("event_mention_argument")
                for argument in arguments:
                    arg_role = argument.get("ROLE")
                    arg_offset = self.get_offset_tuple(
                        argument.xpath("extent/charseq")[0])
                    # TODO: NEED TO ADD ENTITY TYPES, getting them from refids !!!
                    event_arguments.append({
                        "role": arg_role,
                        "start": arg_offset[0],
                        "end": arg_offset[1]
                    })

                event_mentions.append({
                    "event_id": event_id,
                    "event_type": event_type,
                    "event_subtype": event_subtype,
                    "trigger": {
                        "start": trigger_offset[0],
                        "end": trigger_offset[1]
                    },
                    "arguments": event_arguments,
                    "sent_idx": sent_idx
                })

        # For printing later
        # old_event_mentions = copy.deepcopy(event_mentions)

        tokens_list_for_printing = []
        for i, sentence_offset in enumerate(self.sentence_offsets):
            sentence_text = doc_text[sentence_offset[0]:sentence_offset[1]]
            token_offsets = list(
                self.word_tokenizer.span_tokenize(sentence_text))
            tokens = [
                sentence_text[offset[0]:offset[1]] for offset in token_offsets
            ]
            tokens_list_for_printing.append(tokens)
            entity_mentions = []
            curr_event_mentions = []

            for j in range(len(event_mentions)):
                mention = event_mentions[j]
                if mention["sent_idx"] == i:
                    # ipdb.set_trace()
                    start_idx, end_idx = self.offset_to_token(
                        mention["trigger"]["start"],
                        mention["trigger"]["end"],
                        token_offsets,
                        normalize=sentence_offset[0])
                    event_mentions[j]["trigger"]["start"] = start_idx
                    event_mentions[j]["trigger"]["end"] = end_idx

                    for k, argument in enumerate(mention["arguments"]):
                        start_idx, end_idx = self.offset_to_token(
                            argument["start"],
                            argument["end"],
                            token_offsets,
                            normalize=sentence_offset[0])
                        event_mentions[j]["arguments"][k]["start"] = start_idx
                        event_mentions[j]["arguments"][k]["end"] = end_idx

                    curr_event_mentions.append(event_mentions[j])

            self.df = self.df.append(
                {
                    "doc_id": doc_id,
                    "sentence": sentence_text,
                    "tokens": tokens,
                    "events": curr_event_mentions,
                    "entities": entity_mentions
                },
                ignore_index=True)

        # Printing stuff
        # for mention, old_mention in zip(event_mentions, old_event_mentions):
        #     tokens = tokens_list_for_printing[mention["sent_idx"]]
        #     print("Offset version trigger : %s , Tokens version trigger : %s" %(doc_text[old_mention["trigger"]["start"]:old_mention["trigger"]["end"]], tokens[mention["trigger"]["start"]:mention["trigger"]["end"]]))
        #     for argument, old_argument in zip(mention["arguments"], old_mention["arguments"]):
        #         print("Offset version argument : %s , Tokens version argument : %s" %(doc_text[old_argument["start"]:old_argument["end"]], tokens[argument["start"]:argument["end"]]))

        #     print("===========")

    # TODO: Remove debug stuff
    def fix_offsets(self, extents):
        offsets = self.sentence_offsets
        assert (len(offsets) > 1)
        # print(offsets)
        # print("*************")

        after_count = 0
        before_count = 0
        for extent in extents:
            # Check stuff for printing
            if len([
                    offset for offset in offsets
                    if extent[0] >= offset[0] and extent[1] <= offset[1]
            ]) == 0:
                before_count += 1

            if extent[1] <= offsets[0][1]:
                continue

            for idx in range(1, len(offsets)):
                offset = offsets[idx]
                if extent[1] <= offset[1]:  # Ends before this sentence.
                    if extent[0] < offset[0]:  # Starts before this sentence
                        # Fixing
                        # print("-------")
                        # print(extent)
                        # print(offsets)
                        for j in range(
                                idx - 1, -1, -1
                        ):  # For all sentences' offsets before this offset
                            del offsets[j + 1]
                            if extent[0] >= offsets[j][0]:
                                offsets[j] = (offsets[j][0], offset[1])
                                break

                        # print(offsets)
                        break

                    else:  # Nothing wrong with this extent
                        break

            # Check stuff for printing
            if len([
                    offset for offset in offsets
                    if extent[0] >= offset[0] and extent[1] <= offset[1]
            ]) == 0:
                ipdb.set_trace()
                # MISSES some due to spaces between sentences
                # print(extent)
                # print(text[extent[0]:extent[1]])
                after_count += 1

        # print("Before : %d -> After : %d" %(before_count, after_count))
        # print("================================================================================================================")

        self.sentence_offsets = offsets
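offset_to_token is the piece that maps ACE character offsets onto token indexes; the same logic can be checked standalone against nltk spans, with a made-up sentence and trigger:

from nltk.tokenize import WhitespaceTokenizer

sentence_text = "The bomb exploded near the embassy yesterday"
token_offsets = list(WhitespaceTokenizer().span_tokenize(sentence_text))

def offset_to_token(start, end, token_offsets):
    # returns an exclusive (start_idx, end_idx) token range covering [start, end)
    for i, (_, tok_end) in enumerate(token_offsets):
        if end <= tok_end:
            for j in range(i, -1, -1):
                if start >= token_offsets[j][0]:
                    return j, i + 1
    raise ValueError("offsets %d:%d not covered by the tokens" % (start, end))

trigger_start = sentence_text.index("exploded")
print(offset_to_token(trigger_start, trigger_start + len("exploded"), token_offsets))  # (2, 3)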
Example #9
import pymysql
from nltk.tokenize import WhitespaceTokenizer

connection = pymysql.connect(host="127.0.0.1",
                             user="******",
                             password="******",
                             charset='utf8',
                             db='tf-idf',
                             cursorclass=pymysql.cursors.DictCursor)

cursor = connection.cursor()

terms = ['debut', 'two', 'language', 'also']
tokenizer = WhitespaceTokenizer()

sql = 'SELECT * FROM wiki'
cursor.execute(sql)
for record in cursor.fetchall():
    doc_id = record['id']
    text = record['text']
    for term in terms:
        for start, end in tokenizer.span_tokenize(text):
            if text[start:end].lower() == term:
                insert_sql = 'INSERT INTO inverted_index VALUES (%s, %s)'
                cursor.execute(insert_sql, (term, doc_id))
                break

connection.commit()
connection.close()
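The same term-to-document mapping can be built in memory before involving MySQL. A minimal sketch with hypothetical documents standing in for the wiki table:

from collections import defaultdict
from nltk.tokenize import WhitespaceTokenizer

documents = [(1, "Her debut album had two singles"),
             (2, "The language is also spoken abroad")]
terms = ['debut', 'two', 'language', 'also']

tokenizer = WhitespaceTokenizer()
inverted_index = defaultdict(set)
for doc_id, text in documents:
    for start, end in tokenizer.span_tokenize(text):
        token = text[start:end].lower()
        if token in terms:
            inverted_index[token].add(doc_id)

print(dict(inverted_index))  # {'debut': {1}, 'two': {1}, 'language': {2}, 'also': {2}}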
Example #10
class InvertedIndex:
    ''' Main Inverted-Index structure'''
    def __init__(self):
        self._tokenizer = WhitespaceTokenizer()
        self._index_cache = IndexCache()
        self._stop_words = set(stopwords.words('english'))
        self._stemmer = SnowballStemmer("english")
        self._max_documents_per_shard = 50000
        self._num_documents_in_current_shard = 0
        if os.path.isfile("index_data/index.meta"):
            # the shard counter is pickled in binary mode by save(), so read it back in binary
            self._num_documents_in_current_shard = pickle.load(
                open("index_data/index.meta", "rb"))

    def search(self, query):
        combined_results = None
        ret_results = None
        for i in range(0, len(query), 2):
            op = query[i]
            keyword = self._stemmer.stem(query[i + 1].strip(
                string.punctuation))
            keyword_results = self._search_keyword(keyword)
            if combined_results:
                if op == "AND":
                    combined_results = combined_results.intersection(
                        set(keyword_results.keys()))
                elif op == "OR":
                    combined_results = combined_results.union(
                        set(keyword_results.keys()))
                else:
                    return {"status": False, "message": "Malformed query"}
                for doc in list(ret_results.keys()):
                    if doc not in combined_results:
                        del ret_results[doc]
                    elif keyword_results.get(doc):
                        # set.union returns a new set, so assign the merged positions back
                        ret_results[doc] = ret_results[doc].union(keyword_results[doc])
                for doc in keyword_results:
                    if doc in combined_results and doc not in ret_results:
                        ret_results[doc] = keyword_results[doc]
            else:
                combined_results = set(keyword_results.keys())
                ret_results = keyword_results
        result_counts = dict()
        for el in ret_results:
            result_counts[el] = len(ret_results[el])
        sorted_result_counts = sorted(result_counts.items(),
                                      key=operator.itemgetter(1),
                                      reverse=True)
        sorted_results = []
        for key, _ in sorted_result_counts:
            sorted_results.append({"key": key, "positions": ret_results[key]})
        if len(sorted_results) > 0:
            ret = {"status": True, "results": sorted_results}
        else:
            ret = {"status": False, "message": "No hits"}
        return ret

    def _search_keyword(self, query):
        docs = self._index_cache.get(query)
        if not docs:
            return dict()
        return docs

    def add(self, key, text):
        self._num_documents_in_current_shard += 1
        if self._num_documents_in_current_shard > self._max_documents_per_shard:
            self._num_documents_in_current_shard = 0
            self._index_cache.create_new_shard()
        token_positions = self._tokenizer.span_tokenize(text)
        for pos in token_positions:
            start_pos = pos[0]
            end_pos = pos[1]
            token = text[start_pos:end_pos].lower()
            if token in self._stop_words:
                continue
            token = token.strip(string.punctuation)
            token = self._stemmer.stem(token)
            if len(token) > 0:
                self._index_cache.add(token, key, (start_pos, end_pos))

    def delete(self, key, text):
        pass

    def save(self):
        pickle.dump(self._num_documents_in_current_shard,
                    open("index_data/index.meta", "wb"))
        self._index_cache.flush()
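The normalisation done in add (lowercase, drop stopwords, strip punctuation, stem, record the span) can be tried outside the shard machinery. A small sketch, assuming the nltk stopwords corpus has been downloaded:

import string
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import WhitespaceTokenizer

stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")

text = "The runners were running quickly through the parks."
postings = {}
for start, end in WhitespaceTokenizer().span_tokenize(text):
    token = text[start:end].lower()
    if token in stop_words:
        continue
    token = stemmer.stem(token.strip(string.punctuation))
    if token:
        postings.setdefault(token, []).append((start, end))

print(postings)  # e.g. {'runner': [(4, 11)], 'run': [(17, 24)], 'quick': [(25, 32)], 'park': [(45, 51)]}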
Example #11
class nltk_tokenizer(IncrementalTransform):
    '''
    a streamcorpus_pipeline IncrementalTransform that converts a chunk into a new
    chunk with Sentence objects generated using NLTK tokenizers
    '''
    config_name = 'nltk_tokenizer'
    tagger_id = 'nltk_tokenizer'
    def __init__(self, *args, **kwargs):
        super(nltk_tokenizer, self).__init__(*args, **kwargs)
        self.sentence_tokenizer = PunktSentenceTokenizer()
        self.word_tokenizer = WhitespaceTokenizer() #PunktWordTokenizer()

    def _sentences(self, clean_visible):
        'generate strings identified as sentences'
        previous_end = 0
        clean_visible = clean_visible.decode('utf8')
        assert isinstance(clean_visible, unicode)
        for start, end in self.sentence_tokenizer.span_tokenize(clean_visible):
            ## no need to check start, because the first byte of text
            ## is always first byte of first sentence, and we will
            ## have already made the previous sentence longer on the
            ## end if there was an overlap.
            if  start < previous_end:
                start = previous_end
                if start > end:
                    ## skip this sentence... because it was eaten by
                    ## an earlier sentence with a label
                    continue
            try:
                label = self.label_index.find_le(end)
            except ValueError:
                label = None
            if label:
                off = label.offsets[OffsetType.BYTES]
                end = max(off.first + off.length, end)
            previous_end = end
            sent_str = clean_visible[start:end]
            yield start, end, sent_str

    def make_label_index(self, stream_item):
        'make a sortedcollection on body.labels'
        labels = stream_item.body.labels.get(self.config.get('annotator_id'))
        if not labels:
            labels = []

        self.label_index = SortedCollection(
            labels,
            key=lambda label: label.offsets[OffsetType.BYTES].first)

    def make_sentences(self, stream_item):
        'assemble Sentence and Token objects'
        self.make_label_index(stream_item)
        sentences = []
        token_num = 0
        new_mention_id = 0
        for sent_start, sent_end, sent_str in self._sentences(stream_item.body.clean_visible):
            assert isinstance(sent_str, unicode)
            sent = Sentence()
            sentence_pos = 0
            for start, end in self.word_tokenizer.span_tokenize(sent_str):
                token_str = sent_str[start:end].encode('utf8')
                tok = Token(
                    token_num=token_num,
                    token=token_str,
                    sentence_pos=sentence_pos,
                )
                tok.offsets[OffsetType.BYTES] = Offset(
                    type=OffsetType.BYTES, 
                    first=sent_start + start,
                    length = end - start,
                )
                ## whitespace tokenizer will never get a token
                ## boundary in the middle of an 'author' label
                try:
                    #logger.debug('searching for %d in %r', sent_start + start, self.label_index._keys)
                    label = self.label_index.find_le(sent_start + start)
                except ValueError:
                    label = None
                if label:
                    off = label.offsets[OffsetType.BYTES]
                    if off.first + off.length > sent_start + start:
                        logger.info('overlapping label: %r' % label.target.target_id)
                        ## overlaps
                        streamcorpus.add_annotation(tok, label)
                        assert label.annotator.annotator_id in tok.labels

                        logger.info('adding label to tok: %r has %r',
                                     tok.token, label.target.target_id)

                        if label in self.label_to_mention_id:
                            mention_id = self.label_to_mention_id[label]
                        else:
                            mention_id = new_mention_id
                            new_mention_id += 1
                            self.label_to_mention_id[label] = mention_id

                        tok.mention_id = mention_id

                token_num += 1
                sentence_pos += 1
                sent.tokens.append(tok)
            sentences.append(sent)
        return sentences

    def process_item(self, stream_item, context=None):
        if not hasattr(stream_item.body, 'clean_visible') or not stream_item.body.clean_visible:
            return stream_item
            
        self.label_index = None
        self.label_to_mention_id = dict()
        stream_item.body.sentences[self.tagger_id] = self.make_sentences(stream_item)

        return stream_item

    def __call__(self, stream_item, context=None):
        ## support the legacy callable API
        return self.process_item(stream_item, context)
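self.label_index.find_le(...) comes from a SortedCollection recipe that is not shown in this listing; its role here (find the rightmost label whose byte offset starts at or before the token) can be mimicked with the standard library. A sketch with made-up labels:

import bisect

# hypothetical labels as (byte_offset, target_id) pairs, sorted by offset
labels = [(0, 'author-1'), (120, 'author-2'), (430, 'author-3')]
label_keys = [first for first, _ in labels]

def find_le(key):
    # rightmost label whose offset is <= key, like SortedCollection.find_le
    i = bisect.bisect_right(label_keys, key)
    if i == 0:
        raise ValueError('no label at or before %d' % key)
    return labels[i - 1]

print(find_le(130))   # (120, 'author-2')
print(find_le(1000))  # (430, 'author-3')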