Example #1
    def tag(self, sent):
        """POS-tag a sentence, treating each span returned by find_time as a
        single token and assigning it the tag that find_time supplies."""
        times = self.find_time(sent)
        intervals = dict([(time[0], time[1]) for time in times])
        tag_dict = dict([(time[2], time[3]) for time in times])
        tokenizer = WordPunctTokenizer()
        # for a in [time[2] for time in times]:
        #     tokenizer.add_mwe(a.split())

        # --- FIXED ---
        original_tokens = tokenizer.tokenize(sent)
        original_tags = pos_tag(original_tokens)
        # --- END FIXED ---

        tokens = []
        current = 0
        for span in tokenizer.span_tokenize(sent):
            if span[0] < current:
                continue
            if span[0] in intervals:
                tokens.append(f'__{sent[span[0]: intervals[span[0]]]}')
                current = intervals[span[0]]
            else:
                tokens.append(sent[span[0]:span[1]])
                current = span[1]

        tags = pos_tag(tokens)

        new_tags = []
        for word, tag in tags:
            if word[:2] == '__':
                new_tags.append((word[2:], tag_dict[word[2:]]))
            else:
                tag = [t[1] for t in original_tags if t[0] == word][0]  # FIXED
                new_tags.append((word, tag))
        return new_tags
Example #2
from nltk import pos_tag
from nltk.tokenize import WordPunctTokenizer


def tokenize(text):
    """Tokenize a raw text.

    Args:
        text (str)

    Returns: list of {token, char1, char2, pos}
    """
    tokenizer = WordPunctTokenizer()

    # Get token character spans.
    spans = list(tokenizer.span_tokenize(text))

    # Materialize the token stream.
    tokens = [text[c1:c2] for c1, c2 in spans]

    # Tag parts-of-speech.
    tags = pos_tag(tokens)

    return [

        dict(
            token=token.lower(),
            char1=c1,
            char2=c2,
            pos=pos,
        )

        for (c1, c2), token, (_, pos) in
        zip(spans, tokens, tags)

    ]
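A minimal usage sketch (the sample sentence and the POS tags in the comments are illustrative; actual tags depend on the NLTK tagger model installed):

for entry in tokenize("Hello, world!"):
    print(entry)
# Each entry pairs a lower-cased token with its character span and POS tag, e.g.:
# {'token': 'hello', 'char1': 0, 'char2': 5, 'pos': 'NNP'}
# {'token': ',', 'char1': 5, 'char2': 6, 'pos': ','}
# ...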
Example #3
    def tokens(self):
        """Tokenize the text.
        """
        tokenizer = WordPunctTokenizer()

        # Get token character spans.
        spans = list(tokenizer.span_tokenize(self.text))

        # Materialize the token stream.
        tokens = [self.text[c1:c2] for c1, c2 in spans]

        tags = pos_tag(tokens)

        return [

            Token(
                token=token.lower(),
                char1=c1,
                char2=c2,
                pos=pos,
            )

            for (c1, c2), token, (_, pos) in
            zip(spans, tokens, tags)

        ]
Example #4
    def __get_words_boundaries(self):
        """
        Tokenize the words in the document and return the character-offset
        boundaries of each word, using WordPunctTokenizer.
        :return: list of (start, end) spans into self.text
        """
        tokenizer = WordPunctTokenizer()
        words = list(tokenizer.span_tokenize(self.text))
        return words
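For reference, a short sketch of what span_tokenize returns on a small string (WordPunctTokenizer splits on runs of word characters versus runs of punctuation):

from nltk.tokenize import WordPunctTokenizer

spans = list(WordPunctTokenizer().span_tokenize("Hello, world!"))
# -> [(0, 5), (5, 6), (7, 12), (12, 13)]   i.e. 'Hello', ',', 'world', '!'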
Example #5
from nltk.tokenize import WordPunctTokenizer

# convert_coord_2dbformat is defined elsewhere in the original module.


def change_db2(text, origin_dict, id):
    print(origin_dict)
    tokens_ar = []
    word_punct_tokenizer = WordPunctTokenizer()
    for token in word_punct_tokenizer.span_tokenize(origin_dict):
        tokens_ar.append(token)
    for line in text.split("\n"):
        markup_error_line = line.split(';')
        print "MARKUP", markup_error_line
        convert_coord_2dbformat(markup_error_line, tokens_ar, id)
Example #6
from typing import List

from nltk.tokenize import WordPunctTokenizer


def word_loss(chunk: str, losses: List[float]) -> List[float]:
    tokenizer = WordPunctTokenizer()
    spans = list(tokenizer.span_tokenize(chunk))
    # FIXME - this is a bit optimistic, do it properly:
    # gather outputs, check wordpunct tokenizer regexp against characters
    return [
        sum(losses[start:end])
        # first and last might be incomplete
        for start, end in spans[1:-1]
    ]
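A hedged usage sketch, assuming losses holds one value per character of chunk (which is what the span-based slicing implies); the numbers are made up:

chunk = "a bc d"
losses = [1.0, 0.0, 0.25, 0.5, 0.0, 2.0]   # one loss per character (illustrative)
# span_tokenize gives (0, 1), (2, 4), (5, 6); the first and last spans are dropped,
# so only "bc" contributes: 0.25 + 0.5
word_loss(chunk, losses)
# -> [0.75]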
Example #7
import os

from lxml import etree
from nltk import pos_tag
from nltk.tokenize import WordPunctTokenizer

# `token` and `get_init_offset` come from the surrounding module;
# etree is assumed to be lxml's (XMLParser(recover=True)).


def convert(sgm_path, apf_path, bio_path=None):
    xml_parser = etree.XMLParser(recover=True)
    try:
        sgm_tree = etree.parse(sgm_path, xml_parser)
        apf_tree = etree.parse(apf_path, xml_parser)
        if not bio_path:
            bio_path = os.path.commonprefix([sgm_path, apf_path]) + 'bio'
        output = open(bio_path, 'w')
    except:
        print('Something wrong when opening/parsing xml file, or opening output file')
        return
    
    init_offset = get_init_offset(sgm_path)
    text = sgm_tree.xpath('/DOC/BODY/TEXT')[0].text.strip('\n')
    
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)
    spans = list(tokenizer.span_tokenize(text))
    pos = pos_tag(tokens)
    
    ts = []
    for i in range(len(tokens)):
        t = token()
        t.text = tokens[i]
        t.pos = pos[i][1]
        t.span = (spans[i][0] + init_offset, spans[i][1] - 1 + init_offset)
        t.bio = 'O'
        ts.append(t)
        
    entits = apf_tree.xpath('/source_file/document/entity')
    for enty in entits:
        enty_type = enty.get('TYPE')
        mentions = enty.xpath('entity_mention')
        for m in mentions:
            head = m.xpath('head')[0]
            span = (int(head[0].get('START')), int(head[0].get('END')))
            found = False
            for t in ts:
                if t.span[0] == span[0]:
                    t.bio = 'B-' + enty_type
                    found = True
                if t.span[0] > span[0] and t.span[1] <= span[1]:
                    t.bio = 'I-' + enty_type
                    found = True
            if not found:
                print('entity mention head span not found', span, apf_path)
    
    for t in ts:
        # print(t.text, t.span)
        output.write('\t'.join([t.text, t.pos, t.bio]) + '\n')
    output.close()
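A usage sketch with hypothetical file names; when bio_path is omitted, the output path is the common prefix of the two inputs plus 'bio':

convert('doc01.sgm', 'doc01.apf.xml')
# writes one "token<TAB>POS<TAB>BIO-label" line per token to 'doc01.bio'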
Example #8
from typing import List

from nltk.tokenize import WordPunctTokenizer

# Token and Tokenizer come from the surrounding project.


class NltkTokenizer(Tokenizer):

    """
    Tokenizer that uses WordPunctTokenizer from NLTK
    """

    tokenizer: WordPunctTokenizer

    def __init__(self) -> None:
        self.tokenizer = WordPunctTokenizer()

    def tokenize(self, text: str) -> List[Token]:
        spans = list(self.tokenizer.span_tokenize(text))
        words = [text[span_start:span_end] for span_start, span_end in spans]
        return [Token(word=word, span=span) for word, span in zip(words, spans)]
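A usage sketch; the printed representation assumes Token simply stores a word together with its (start, end) character span:

nltk_tokenizer = NltkTokenizer()
for tok in nltk_tokenizer.tokenize("Hello, world!"):
    print(tok)
# e.g. Token(word='Hello', span=(0, 5)), Token(word=',', span=(5, 6)), ...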
Example #9
            ann_path = os.path.join(brat_data_dirname, filename)
            if os.stat(ann_path).st_size > 0:
                doc_path = os.path.join(brat_data_dirname, '%s.txt' % basename)

                with open(doc_path, 'r') as doc_file, open(ann_path,
                                                           'r') as ann_file:
                    start_dict = {}
                    end_dict = {}
                    for (id, type_pos,
                         entity) in (tuple(line.strip().split('\t'))
                                     for line in ann_file):
                        (type, start, end) = type_pos.split(' ')
                        start_dict[start] = type
                        end_dict[end] = type

                    content = doc_file.read()
                    ann_content = ""

                    active_type = 'O'
                    for (start, end) in tokenizer.span_tokenize(content):
                        if str(start) in start_dict:
                            active_type = start_dict[str(start)]

                        #print("%s\t%s" % (content[start:end], active_type))
                        col_file.write("%s\t%s\n" %
                                       (content[start:end], active_type))

                        if str(end) in end_dict:
                            active_type = 'O'
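For context, a sketch of the kind of BRAT-style standoff annotation line the inner loop parses (the concrete values are illustrative):

ann_line = "T1\tPERSON 0 5\tAlice"
id, type_pos, entity = ann_line.strip().split('\t')
type, start, end = type_pos.split(' ')
# after this, start_dict['0'] = 'PERSON' and end_dict['5'] = 'PERSON'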