Example #1
    def __init__(self):
        # For parsing text file
        self.sent_tokenizer = SentenceTokenizer()
        self.word_tokenizer = WordTokenizer()

        # Internal representation natural for i2b2 format
        self.text = ''
        self.data = []  # list of list of tokens
        self.line_inds = []
        self.classifications = []
        self.fileName = 'no-file'
Example #2
    def __init__(self):
        # For parsing text file
        self.sent_tokenizer = SentenceTokenizer()
        self.word_tokenizer = WordTokenizer()

        # Internal representation natural for i2b2 format
        self.text = ''
        self.data            = []  # list of list of tokens
        self.line_inds = []
        self.classifications = []
        self.fileName = 'no-file'
Example #3
class Note_semeval(AbstractNote):

    def __init__(self):
        # For parsing text file
        self.sent_tokenizer = SentenceTokenizer()
        self.word_tokenizer = WordTokenizer()

        # Internal representation natural for i2b2 format
        self.text = ''
        self.data            = []  # list of list of tokens
        self.line_inds = []
        self.classifications = []
        self.fileName = 'no-file'


    def getExtension(self):
        return 'pipe'


    def getText(self):
        return self.text


    def getTokenizedSentences(self):
        return self.data


    def getClassificationTuples(self):
        return self.classifications


    def getLineIndices(self):
        return self.line_inds

    def read_standard(self, txt, con=None):

        start = 0
        end = 0

        with open(txt) as f:

            # Get entire file
            text = f.read()
            self.text = text

            # Sentence splitter
            sents = self.sent_tokenizer.tokenize(txt)

            # Tokenize each sentence into words (and save line number indices)
            toks = []
            gold = []          # Actual lines
            
            for s in sents:
                gold.append(s)

                # Store data
                toks = self.word_tokenizer.tokenize(s)
                self.data.append(toks)

                # Keep track of which indices each line has
                end = start + len(s)

                self.line_inds.append( (start,end) )
                start = end + 1

                # Skip ahead to next non-whitespace
                while (start < len(text)) and text[start].isspace(): start += 1


        # If an accompanying concept file was specified, read it
        if con:
            classifications = []
            with open(con) as f:
                for line in f:

                    # Empty line
                    if line == '\n': continue

                    # Parse concept file line
                    fields = line.strip().split('||')
                    #print fields
                    concept = fields[0]
                    span_inds = []
                    for i in range(1,len(fields),2):
                        span = int(fields[i]), int(fields[i+1])
                        span_inds.append( span )

                    #print '\t', concept
                    #print '\t', span_inds

                    classifications.append( (concept, span_inds) )

            # Concept file does not guarantee ordering by line number
            self.classifications = sorted(classifications, cmp=concept_cmp)




    def read(self, txt, con=None):            

        # Filename
        self.fileName = os.path.split(txt)[1]  # match the attribute set in __init__ and used by write()

        start = 0
        end = 0
        with open(txt) as f:

            # Get entire file
            text = f.read()
            #print "\nTEXT:------------------"
            #print text

            self.text = text

            # Sentence splitter
            sents = self.sent_tokenizer.tokenize(txt)

            #print "\nSENTS:-----------------------------"
            #print sents

            # Tokenize each sentence into words (and save line number indices)
            toks = []
            gold = []          # Actual lines
            
            for s in sents:
           
                gold.append(s)

                #print "\nsentence:-------------------------------"
                #print s

                #print s

                # Store data
                toks = self.word_tokenizer.tokenize(s)

                #print "\ntokenized sentence:---------------------------------"
                #print toks

                self.data.append(toks)

                # Keep track of which indices each line has
                end = start + len(s)

                #print "\nindices:--------------------------------------------"
                #print (start, end)

                #print "\nusing index on entire txt----------------------------"
                #print text[start:end]

                #print "\nEQUAL?"
                #print text[start:end] == s

                self.line_inds.append( (start,end) )
                start = end + 1

                # Skip ahead to next non-whitespace
                while (start < len(text)) and text[start].isspace(): start += 1

            '''
            for line,inds in zip(gold,self.line_inds):
                print '!!!' + line + '!!!'
                print '\t', 'xx'*10
                print inds
                print '\t', 'xx'*10
                print '!!!' + text[inds[0]: inds[1]] + '!!!'
                print '---'
                print '\n'
                print 'Xx' * 20
            '''

        #lno,span = lineno_and_tokspan((2329, 2351))
        #lno,span = lineno_and_tokspan((1327, 1344))
        #print self.data[lno][span[0]:span[1]+1]


        # If an accompanying concept file was specified, read it
        if con:
            offset_classifications = []
            classifications = []
            with open(con) as f:
                for line in f:

                    # Empty line
                    if line == '\n': continue

                    # Parse concept file line
                    fields = line.strip().split('||')
                    #print fields
                    concept = fields[1]
                    cui     = fields[2]
                    span_inds = []
                    for i in range(3,len(fields),2):
                        span = int(fields[i]), int(fields[i+1])
                        span_inds.append( span )

                    #print '\t', concept
                    #print '\t', span_inds

                    # Everything is a Disease_Disorder
                    concept = 'problem'

                    # FIXME - For now, treat non-contiguous spans as separate
                    for span in span_inds:
                        #l,(start,end) = lineno_and_tokspan(span)
                        # Add the classification to the Note object
                        offset_classifications.append((concept,span[0],span[1]))
                    classifications.append( (concept, span_inds) )

            # Safe guard against concept file having duplicate entries
            #classifications = list(set(classifications))

            # Concept file does not guarantee ordering by line number
            self.classifications = sorted(classifications, cmp=concept_cmp)




    def write(self, labels):

        # If given labels to write, use them. Default to self.classifications
        if labels != None:
            # Translate token-level annotations to character offsets
            classifications = []
            for classification in labels:
                inds = self.line_inds
                data = self.data
                text = self.text
                
                # FIXME - Assumes that token-level does not have noncontig
                concept = classification[0]
                lno     = classification[1] - 1
                start   = classification[2]
                end     = classification[3]
                tokspan = start,end

                # Get character offset span                
                span = lno_and_tokspan__to__char_span(inds,data,text,lno,tokspan)
                classifications.append( (concept,span) )

        elif self.classifications != None:
            classifications = self.classifications
        else:
            raise Exception('Cannot write concept file: must specify labels')

        exit()
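        # NOTE: the exit() call above looks like a debugging leftover; as long as
        # it stays, the formatted concept string below is never built or returned.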

        # return value
        retStr = ''

        for concept,span_inds in classifications:
            retStr += self.fileName + '.text||%s||CUI-less' % concept
            for span in span_inds:
                retStr += '||' + str(span[0]) + "||" +  str(span[1])
            retStr += '\n'

        return retStr
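
A minimal usage sketch for the class above (the .text/.pipe paths are hypothetical; only read() and the getter methods defined in this example are exercised, in the same Python 2 style as the source):

note = Note_semeval()
note.read('patient-001.text', 'patient-001.pipe')   # hypothetical document/concept pair

print note.getText()[:80]               # raw document text
print note.getTokenizedSentences()[0]   # first tokenized sentence
print note.getClassificationTuples()    # [(concept, [(start, end), ...]), ...]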
Example #4
class Note_semeval(AbstractNote):
    def __init__(self):
        # For parsing text file
        #        self.opennlp_tokenizer = OpenNLPTokenizer();

        self.sent_tokenizer = SentenceTokenizer()
        self.word_tokenizer = WordTokenizer()

        # Internal representation natural for i2b2 format
        self.text = ''
        self.data = []  # list of list of tokens
        self.line_inds = []
        self.classifications = []
        self.fileName = 'no-file'

    def getExtension(self):
        return 'pipe'

    def getText(self):
        return self.text

    def getTokenizedSentences(self):
        return self.data

    def getClassificationTuples(self):
        return self.classifications

    def getLineIndices(self):
        return self.line_inds

    def setFileName(self, fname):
        self.fileName = fname

    def read_standard(self, txt, con=None):

        start = 0
        end = 0

        with open(txt) as f:

            # Get entire file
            text = f.read()
            self.text = text

            # Sentence splitter

            sents = self.sent_tokenizer.tokenize(txt)
            #            sents = self.opennlp_tokenizer.sentenize(text)

            # Tokenize each sentence into words (and save line number indices)
            toks = []
            gold = []  # Actual lines

            for s in sents:
                gold.append(s)

                # Store data

                toks = self.word_tokenizer.tokenize(s)
                #                toks = self.opennlp_tokenizer.tokenize(s)

                self.data.append(toks)

                # Keep track of which indices each line has
                end = start + len(s)

                self.line_inds.append((start, end))
                start = end + 1

                # Skip ahead to next non-whitespace
                while (start < len(text)) and text[start].isspace():
                    start += 1

        # If an accompanying concept file was specified, read it
        if con:
            classifications = []
            with open(con) as f:
                for line in f:

                    # Empty line
                    if line == '\n': continue

                    # Parse concept file line
                    fields = line.strip().split('||')
                    #print fields
                    concept = fields[0]
                    span_inds = []
                    for i in range(1, len(fields), 2):
                        span = int(fields[i]), int(fields[i + 1])
                        span_inds.append(span)

                    #print '\t', concept
                    #print '\t', span_inds

                    classifications.append((concept, span_inds))

            # Concept file does not guarantee ordering by line number
            self.classifications = sorted(classifications, cmp=concept_cmp)

    def read(self, txt, con=None):
        #        print "semeval note read called"

        # Filename
        self.fileName = txt

        #        print self.fileName

        start = 0
        end = 0
        with open(txt) as f:

            # Get entire file
            original_text = f.read()
            text = remove_non_ascii(original_text)

            #            print "original text:"
            #            print original_text

            #            print "text with ascii removed:"
            #            print text

            #print "\nTEXT:------------------"
            #print text

            self.text = text

            # Sentence splitter

            sents = self.sent_tokenizer.tokenize(txt, "semeval")
            #            sents = self.opennlp_tokenizer.sentenize(text)

            #            print "sentenized text: "
            #            print sents

            #print "\nSENTS:-----------------------------"
            #            for line in sents:
            #                print line

            # Tokenize each sentence into words (and save line number indices)
            toks = []
            gold = []  # Actual lines

            i = 0
            for s in sents:
                i += 1

                gold.append(s)

                b = False
                #if b: print "\nsentence:-------------------------------"
                #if b: print '<s>' + s + '</s>'

                #print s

                # Store data

                toks = self.word_tokenizer.tokenize(s, "semeval")
                #    toks = self.opennlp_tokenizer.tokenize(s)

                #print toks

                #if b: print "\ntokenized sentence:----------------------------"
                #if b: print toks

                self.data.append(toks)

                #                print self.data

                # Keep track of which indices each line has
                end = start + len(s)

                #if b: print "\nindices:---------------------------------------"
                #if b: print (start, end)

                #if b: print "\nusing index on entire txt----------------------"
                #if b: print '<s>' + text[start:end] + '</s>'

                # EQUAL?
                # assert( text[start:end] == s ), 'data and text must agree'

                self.line_inds.append((start, end))
                start = end

                # Skip ahead to next non-whitespace
                while (start < len(text)) and text[start].isspace():
                    start += 1
            '''
            for line,inds in zip(gold,self.line_inds):
                print '!!!' + line + '!!!'
                print '\t', 'xx'*10
                print inds
                print '\t', 'xx'*10
                print '!!!' + text[inds[0]: inds[1]] + '!!!'
                print '---'
                print '\n'
                print 'Xx' * 20
            '''

#        print "tokenized data: "
#        print self.data

#        print "tokens, one per line: "
#        for token in [token for l in self.data for token in l]:

#            print token

#        print "\n\n"

# If an accompanying concept file was specified, read it
        if con:

            classifications = []
            with open(con) as f:
                for line in f:

                    #                    print line

                    # Empty line
                    if line == '\n': continue

                    # Parse concept file line
                    fields = line.strip().split('|')
                    #print fields

                    cui = fields[2]
                    span_inds = []

                    spans = fields[1]
                    spans = spans.split(',')
                    spans = [s.split('-') for s in spans]

                    #                    print spans
                    for span in spans:
                        span = int(span[0]), int(span[1])
                        span_inds.append(span)

                    # print span_inds

                    #for i in range(3,len(fields),2):
                    #    span = int(fields[i]), int(fields[i+1])
                    #    span_inds.append( span )

                    # Everything is a Disease_Disorder
                    concept = 'problem'

                    #                    if len(spans) == 1:
                    classifications.append((concept, span_inds))


                    #                    else:
                    #                        print "skipping > 1"

            # Safe guard against concept file having duplicate entries
            classifications = sorted(classifications, cmp=concept_cmp)

            # Hack: Throw away noncontiguous spans that cross line numbers
            newClassifications = []

            #print classifications

            for classification in classifications:
                concept, char_spans = classification

                # Each span (could be noncontiguous span)
                tok_spans = []
                first_lineno = None

                ignore = False
                for span in char_spans:
                    # character offset span --> lineno and list of token index spans
                    lineno, tokspan = lineno_and_tokspan(
                        self.line_inds, self.data, self.text, span, "semeval")
                    tok_spans.append(tokspan)

                    # Ensure all noncontig spans are together on one line
                    if first_lineno == None: first_lineno = lineno

                    # Throw away noncontig spans that cross lines
                    if lineno != first_lineno:
                        ignore = True

                if not ignore:
                    newClassifications.append(classification)

            # Copy changes over
            classifications = newClassifications

            # Hack: Throw away subsumed spans
            # ex. "left and right atrial dilitation" from 02136-017465.text
            classifs = reduce(lambda a, b: a + b,
                              map(lambda t: t[1], classifications))
            classifs = list(set(classifs))
            classifs = sorted(classifs, key=lambda s: s[0])
            #print classifs

            from utilities_for_notes import span_relationship

            newClassifications = []
            for c in classifications:

                ignore = False
                for span in c[1]:
                    #print '\t', span

                    # Slow!
                    # Determine if any part of span is subsumed by other span
                    for cand in classifs:
                        # Don't let identity spans mess up comparison
                        if span == cand: continue

                        # Is current span subsumed?
                        rel = span_relationship(span, cand)
                        if rel == 'subsumes':
                            #print 'SUBSUMED!'
                            ignore = True

                # Only add if no spans are subsumed by others
                if not ignore:
                    newClassifications.append(c)

            #for c in newClassifications: print c
            self.classifications = newClassifications

        #   print self.data

        #for c in newClassifications:
        #    print c
        #exit()

        # Concept file does not guarantee ordering by line number
        #self.classifications = sorted(classifications, cmp=concept_cmp)

    def write(self, labels):

        # Case: User DOES provide predicted annotations (classification tuples)
        if labels != None:
            # Translate token-level annotations to character offsets
            classifications = []
            for classification in labels:
                # Data needed to recover original character offsets
                inds = self.line_inds
                data = self.data
                text = self.text

                # Unpack classification span
                concept = classification[0]
                lno = classification[1] - 1
                tokspans = classification[2]

                # Get character offset span
                spans = []

                #               print tokspans

                for tokspan in tokspans:
                    span = lno_and_tokspan__to__char_span(
                        inds, data, text, lno, tokspan, "semeval")
                    spans.append(span)
                classifications.append((concept, spans))

        elif self.classifications != None:
            classifications = self.classifications
        else:
            raise Exception('Cannot write concept file: must specify labels')

        # Assertion: 'classifications' is a list of (concept,char-span) tups

        # Build output string
        retStr = ''

        # system only covers semeval task 1.
        defaultDisorderSlots = '|no|null|patient|null|no|null|unmarked|null|unmarked|null|false|null|false|null|NULL|null'

        #      print classifications

        # For each classification, format to semeval style
        for concept, span_inds in classifications:

            spansAsStr = ",".join(
                [str(span[0]) + '-' + str(span[1]) for span in span_inds])

            outputLine = "{0}|{1}|{2}".format(self.fileName, spansAsStr,
                                              'CUI-less')

            outputLine += defaultDisorderSlots
            #retStr += self.fileName + '||%s||CUI-less' % concept
            #for span in span_inds:
            #    retStr += '||' + str(span[0]) + "||" +  str(span[1])
            retStr += (outputLine + '\n')

        return retStr[0:-1]
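
For reference, each line emitted by write() above is pipe-delimited: the file name, a comma-separated list of start-end character spans, the literal CUI-less, and the fixed defaultDisorderSlots suffix. A short sketch that reproduces that formatting on a hand-made classification (the spans are invented; the file name is just the one mentioned in the comments of read() above):

fileName = '02136-017465.text'                                # illustrative only
concept, span_inds = 'problem', [(1215, 1233), (1250, 1259)]  # invented spans

spansAsStr = ",".join([str(span[0]) + '-' + str(span[1]) for span in span_inds])
outputLine = "{0}|{1}|{2}".format(fileName, spansAsStr, 'CUI-less')
outputLine += '|no|null|patient|null|no|null|unmarked|null|unmarked|null|false|null|false|null|NULL|null'
# -> 02136-017465.text|1215-1233,1250-1259|CUI-less|no|null|patient|null|...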
Example #5
######################################################################

__author__ = 'Willie Boag'
__date__ = 'Aug 2, 2015'

import string
import sys
import re
import nltk

from abstract_note import AbstractNote
from utilities_for_notes import concept_cmp, classification_cmp
from utilities_for_notes import lineno_and_tokspan, lno_and_tokspan__to__char_span
from utilities_for_notes import WordTokenizer, SentenceTokenizer

word_tokenizer = WordTokenizer()
sent_tokenizer = SentenceTokenizer()


class Note_plain(AbstractNote):
    def __init__(self):
        # Internal representation natural for i2b2 format
        self.data = []  # list of list of tokens
        self.classifications = []  # list of concept tuples
        self.line_inds = []  # list of (start,end) indices for every line

    def getExtension(self):
        return 'plain'

    def getText(self):
        return self.text
Example #6
class Note_semeval(AbstractNote):
    def __init__(self):
        # For parsing text file
        self.sent_tokenizer = SentenceTokenizer()
        self.word_tokenizer = WordTokenizer()

        # Internal representation natural for i2b2 format
        self.text = ''
        self.data = []  # list of list of tokens
        self.line_inds = []
        self.classifications = []
        self.fileName = 'no-file'

    def getExtension(self):
        return 'pipe'

    def getText(self):
        return self.text

    def getTokenizedSentences(self):
        return self.data

    def getClassificationTuples(self):
        return self.classifications

    def getLineIndices(self):
        return self.line_inds

    def read_standard(self, txt, con=None):

        start = 0
        end = 0

        with open(txt) as f:

            # Get entire file
            text = f.read()
            self.text = text

            # Sentence splitter
            sents = self.sent_tokenizer.tokenize(txt)

            # Tokenize each sentence into words (and save line number indices)
            toks = []
            gold = []  # Actual lines

            for s in sents:
                gold.append(s)

                # Store data
                toks = self.word_tokenizer.tokenize(s)
                self.data.append(toks)

                # Keep track of which indices each line has
                end = start + len(s)

                self.line_inds.append((start, end))
                start = end + 1

                # Skip ahead to next non-whitespace
                while (start < len(text)) and text[start].isspace():
                    start += 1

        # If an accompanying concept file was specified, read it
        if con:
            classifications = []
            with open(con) as f:
                for line in f:

                    # Empty line
                    if line == '\n': continue

                    # Parse concept file line
                    fields = line.strip().split('||')
                    #print fields
                    concept = fields[0]
                    span_inds = []
                    for i in range(1, len(fields), 2):
                        span = int(fields[i]), int(fields[i + 1])
                        span_inds.append(span)

                    #print '\t', concept
                    #print '\t', span_inds

                    classifications.append((concept, span_inds))

            # Concept file does not guarantee ordering by line number
            self.classifications = sorted(classifications, cmp=concept_cmp)

    def read(self, txt, con=None):

        # Filename
        self.fileName = os.path.split(txt)[1]  # match the attribute set in __init__ and used by write()

        start = 0
        end = 0
        with open(txt) as f:

            # Get entire file
            text = f.read()
            #print "\nTEXT:------------------"
            #print text

            self.text = text

            # Sentence splitter
            sents = self.sent_tokenizer.tokenize(txt)

            #print "\nSENTS:-----------------------------"
            #print sents

            # Tokenize each sentence into words (and save line number indices)
            toks = []
            gold = []  # Actual lines

            for s in sents:

                gold.append(s)

                #print "\nsentence:-------------------------------"
                #print s

                #print s

                # Store data
                toks = self.word_tokenizer.tokenize(s)

                #print "\ntokenized sentence:---------------------------------"
                #print toks

                self.data.append(toks)

                # Keep track of which indices each line has
                end = start + len(s)

                #print "\nindices:--------------------------------------------"
                #print (start, end)

                #print "\nusing index on entire txt----------------------------"
                #print text[start:end]

                #print "\nEQUAL?"
                #print text[start:end] == s

                self.line_inds.append((start, end))
                start = end + 1

                # Skip ahead to next non-whitespace
                while (start < len(text)) and text[start].isspace():
                    start += 1
            '''
            for line,inds in zip(gold,self.line_inds):
                print '!!!' + line + '!!!'
                print '\t', 'xx'*10
                print inds
                print '\t', 'xx'*10
                print '!!!' + text[inds[0]: inds[1]] + '!!!'
                print '---'
                print '\n'
                print 'Xx' * 20
            '''

        #lno,span = lineno_and_tokspan((2329, 2351))
        #lno,span = lineno_and_tokspan((1327, 1344))
        #print self.data[lno][span[0]:span[1]+1]

        # If an accompanying concept file was specified, read it
        if con:
            offset_classifications = []
            classifications = []
            with open(con) as f:
                for line in f:

                    # Empty line
                    if line == '\n': continue

                    # Parse concept file line
                    fields = line.strip().split('||')
                    #print fields
                    concept = fields[1]
                    cui = fields[2]
                    span_inds = []
                    for i in range(3, len(fields), 2):
                        span = int(fields[i]), int(fields[i + 1])
                        span_inds.append(span)

                    #print '\t', concept
                    #print '\t', span_inds

                    # Everything is a Disease_Disorder
                    concept = 'problem'

                    # FIXME - For now, treat non-contiguous spans as separate
                    for span in span_inds:
                        #l,(start,end) = lineno_and_tokspan(span)
                        # Add the classification to the Note object
                        offset_classifications.append(
                            (concept, span[0], span[1]))
                    classifications.append((concept, span_inds))

            # Safe guard against concept file having duplicate entries
            #classifications = list(set(classifications))

            # Concept file does not guarantee ordering by line number
            self.classifications = sorted(classifications, cmp=concept_cmp)

    def write(self, labels):

        # If given labels to write, use them. Default to self.classifications
        if labels != None:
            # Translate token-level annotations to character offsets
            classifications = []
            for classification in labels:
                inds = self.line_inds
                data = self.data
                text = self.text

                # FIXME - Assumes that token-level does not have noncontig
                concept = classification[0]
                lno = classification[1] - 1
                start = classification[2]
                end = classification[3]
                tokspan = start, end

                # Get character offset span
                span = lno_and_tokspan__to__char_span(inds, data, text, lno,
                                                      tokspan)
                classifications.append((concept, span))

        elif self.classifications != None:
            classifications = self.classifications
        else:
            raise Exception('Cannot write concept file: must specify labels')

        exit()
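        # NOTE: the exit() call above looks like a debugging leftover; as long as
        # it stays, the formatted concept string below is never built or returned.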

        # return value
        retStr = ''

        for concept, span_inds in classifications:
            retStr += self.fileName + '.text||%s||CUI-less' % concept
            for span in span_inds:
                retStr += '||' + str(span[0]) + "||" + str(span[1])
            retStr += '\n'

        return retStr
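
A portability note that applies to all of the examples above: this is Python 2 code. In Python 3, sorted() no longer accepts a cmp= argument and reduce() is no longer a builtin, so the sorting and span-flattening calls would need their functools equivalents, roughly as sketched below (toy data; concept_cmp is assumed to keep its two-argument comparator signature):

from functools import cmp_to_key, reduce
from utilities_for_notes import concept_cmp    # same comparator the examples import

classifications = [('problem', [(10, 15)]), ('problem', [(0, 4)])]  # toy data

# sorted(classifications, cmp=concept_cmp) becomes:
classifications = sorted(classifications, key=cmp_to_key(concept_cmp))

# reduce() now has to be imported from functools:
all_spans = reduce(lambda a, b: a + b, [c[1] for c in classifications])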