Example #1
import os
import re

# clean_text, normalize_tokens, and DocumentException are helpers defined
# elsewhere in the same project.
def read_i2b2(txt, con):
    """
    read_i2b2()

    @param txt. A file path for the tokenized medical record
    @param con. A file path for the i2b2 annotated concepts for txt
    """
    tokenized_sents = []

    sent_tokenize = lambda text: text.split('\n')
    word_tokenize = lambda text: text.split(' ')

    # Read in the medical text
    with open(txt) as f:
        try:
            # Original text file
            text = f.read().strip('\n')
        except UnicodeDecodeError:
            # Re-read with an explicit utf-8 encoding if the default decoding fails
            with open(txt, encoding="utf8") as f_utf8:
                text = f_utf8.read().strip('\n')

        # tokenize
        sentences = sent_tokenize(text)
        # print(sentences)
        '''
        ['DATE OF ADMISSION : MM/DD/YYYY', 'DATE OF DISCHARGE : MM/DD/YYYY', 'DISCHARGE DIAGNOSES :', '1 . Vasovagal syncope , status post fall .', '2 . Traumatic arthritis , right knee .', '3 .
        Hypertension .', '4 ]
        '''
        for sentence in sentences:
            sent = clean_text(sentence.rstrip())

            # lowercase
            sent = sent.lower()

            toks = word_tokenize(sent)

            # normalize tokens
            normed_toks = normalize_tokens(toks)

            #for w in normed_toks:
            #    print(w)
            #print()

            tokenized_sents.append(normed_toks)

    # If an accompanying concept file was specified, read it
    tok_concepts = []
    if con:
        with open(con) as f:
            for line in f.readlines():
                # Empty line
                if not line.strip():
                    continue

                # parse concept line
                concept_regex = r'^c="(.*)" (\d+):(\d+) (\d+):(\d+)\|\|t="(.*)"$'
                match = re.search(concept_regex, line.strip())
                groups = match.groups()

                # retrieve regex info
                concept_text = groups[0]
                start_lineno = int(groups[1])
                start_tok_ind = int(groups[2])
                end_lineno = int(groups[3])
                end_tok_ind = int(groups[4])
                concept_label = groups[5]

                # pre-process text for error-check
                #matching_line = tokenized_sents[start_lineno-1]
                #matching_toks = matching_line[start_tok_ind:end_tok_ind+1]
                #matching_text = ' '.join(matching_toks).lower()
                #concept_text  = ' '.join(word_tokenize(concept_text))

                # error-check info
                assert start_lineno == end_lineno, 'concept must span single line'
                #assert concept_text==matching_text, 'something wrong with inds'

                # add the concept info
                tup = (concept_label, start_lineno, start_tok_ind, end_tok_ind)
                tok_concepts.append(tup)

        # Safe guard against concept file having duplicate entries
        tok_concepts = list(set(tok_concepts))

        # Concept file does not guarantee ordering by line number
        tok_concepts = sorted(tok_concepts, key=lambda t: t[1:])

        # Ensure no overlapping concepts (that would be bad)
        for i in range(len(tok_concepts) - 1):
            c1 = tok_concepts[i]
            c2 = tok_concepts[i + 1]
            if c1[1] == c2[1]:
                if c1[2] <= c2[2] and c2[2] <= c1[3]:
                    fname = os.path.basename(con)
                    error1 = '%s has overlapping entities on line %d' % (fname,
                                                                         c1[1])
                    error2 = "It can't be processed until you remove one"
                    error3 = 'Please modify this file: %s' % con
                    error4 = '\tentity 1: c="%s" %d:%d %d:%d||t="%s"' % (
                        ' '.join(tokenized_sents[c1[1] - 1][c1[2]:c1[3] + 1]),
                        c1[1], c1[2], c1[1], c1[3], c1[0])
                    error5 = '\tentity 2: c="%s" %d:%d %d:%d||t="%s"' % (
                        ' '.join(tokenized_sents[c2[1] - 1][c2[2]:c2[3] + 1]),
                        c2[1], c2[2], c2[1], c2[3], c2[0])
                    error_msg = '\n\n%s\n%s\n\n%s\n\n%s\n%s\n' % (
                        error1, error2, error3, error4, error5)
                    raise DocumentException(error_msg)

    # print(tok_concepts) # ('treatment', 48, 2, 2), ('treatment', 49, 5, 5)]

    return tokenized_sents, tok_concepts
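
The .con files parsed above contain one annotation per line in the i2b2 concept format. A minimal, self-contained sketch of what the regex extracts from such a line (the sample line is invented for illustration):

import re

# Same pattern as in read_i2b2(), written as a raw string.
concept_regex = r'^c="(.*)" (\d+):(\d+) (\d+):(\d+)\|\|t="(.*)"$'

# A made-up i2b2 concept line: tokens 5-6 on line 42, labeled "problem".
sample = 'c="vasovagal syncope" 42:5 42:6||t="problem"'

match = re.search(concept_regex, sample)
concept_text, start_lineno, start_tok, end_lineno, end_tok, label = match.groups()

print(concept_text)                                  # vasovagal syncope
print(start_lineno, start_tok, end_lineno, end_tok)  # 42 5 42 6
print(label)                                         # problem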
Example #2
import os
import re

# clean_text, normalize_tokens, classification_cmp, and DocumentException are
# helpers defined elsewhere in the same project.
def read_i2b2(txt, con):
    """
    read_i2b2()

    @param txt. A file path for the tokenized medical record
    @param con. A file path for the i2b2 annotated concepts for txt
    """
    tokenized_sents = []

    sent_tokenize = lambda text: text.split('\n')
    word_tokenize = lambda text: text.split(' ')

    # Read in the medical text
    with open(txt) as f:
        # Original text file
        text = f.read().strip('\n')

        # tokenize
        sentences = sent_tokenize(text)
        for sentence in sentences:
            sent = clean_text(sentence.rstrip())

            # lowercase
            sent = sent.lower()

            toks = word_tokenize(sent)

            # normalize tokens
            normed_toks = normalize_tokens(toks)

            #for w in normed_toks:
            #    print w
            #print

            tokenized_sents.append(normed_toks)

    # If an accompanying concept file was specified, read it
    tok_concepts = []
    if con:
        with open(con) as f:
            for line in f.readlines():
                # Empty line
                if not line.strip():
                    continue

                # parse concept line
                concept_regex = r'^c="(.*)" (\d+):(\d+) (\d+):(\d+)\|\|t="(.*)"$'
                match = re.search(concept_regex, line.strip())
                groups = match.groups()

                # retrieve regex info
                concept_text = groups[0]
                start_lineno = int(groups[1])
                start_tok_ind = int(groups[2])
                end_lineno = int(groups[3])
                end_tok_ind = int(groups[4])
                concept_label = groups[5]

                # pre-process text for error-check
                #matching_line = tokenized_sents[start_lineno-1]
                #matching_toks = matching_line[start_tok_ind:end_tok_ind+1]
                #matching_text = ' '.join(matching_toks).lower()
                #concept_text  = ' '.join(word_tokenize(concept_text))

                # error-check info
                assert start_lineno == end_lineno, 'concept must span single line'
                #assert concept_text==matching_text, 'something wrong with inds'

                # add the concept info
                tup = (concept_label, start_lineno, start_tok_ind, end_tok_ind)
                tok_concepts.append(tup)

        # Safe guard against concept file having duplicate entries
        tok_concepts = list(set(tok_concepts))

        # Concept file does not guarantee ordering by line number
        tok_concepts = sorted(tok_concepts, cmp=classification_cmp)

        # Ensure no overlapping concepts (that would be bad)
        for i in range(len(tok_concepts) - 1):
            c1 = tok_concepts[i]
            c2 = tok_concepts[i + 1]
            if c1[1] == c2[1]:
                if c1[2] <= c2[2] and c2[2] <= c1[3]:
                    fname = os.path.basename(con)
                    error1 = '%s has overlapping entities on line %d' % (fname,
                                                                         c1[1])
                    error2 = "It can't be processed until you remove one"
                    error_msg = '%s\n%s' % (error1, error2)
                    raise DocumentException(error_msg)

    return tokenized_sents, tok_concepts
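
This variant sorts with a Python 2 cmp function; the cmp keyword was removed from sorted() in Python 3. The same ordering can be recovered with functools.cmp_to_key. A minimal sketch, with a stand-in classification_cmp that is assumed to order concepts by line number and then start token:

import functools

# Stand-in for the project's classification_cmp: order concept tuples
# (label, lineno, start_tok, end_tok) by line number, then by start token.
def classification_cmp(a, b):
    if a[1] != b[1]:
        return a[1] - b[1]
    return a[2] - b[2]

concepts = [('problem', 3, 4, 5), ('treatment', 1, 0, 2), ('problem', 1, 7, 7)]

# Python 3 replacement for sorted(concepts, cmp=classification_cmp)
ordered = sorted(concepts, key=functools.cmp_to_key(classification_cmp))
print(ordered)
# [('treatment', 1, 0, 2), ('problem', 1, 7, 7), ('problem', 3, 4, 5)]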
Example #3
import os
import re

# clean_text, normalize_tokens, and DocumentException are helpers defined
# elsewhere in the same project.
def readDocs(txt, concept):
    tokenizedSentences = []
    sentTokenize = lambda text: text.split('\n')
    wordTokenize = lambda text: text.split(' ')

    with open(txt) as foo:
        text = foo.read().strip('\n')
        sentences = sentTokenize(text)
        for s in sentences:
            sent = clean_text(s.rstrip())
            sent = sent.lower()
            tokens = wordTokenize(sent)
            normedTokens = normalize_tokens(tokens)
            tokenizedSentences.append(normedTokens)

    tokenizedConcepts = []
    if concept:
        with open(concept) as foo:
            for l in foo.readlines():
                if not l.strip():
                    continue

                conceptRegex = r'^c="(.*)" (\d+):(\d+) (\d+):(\d+)\|\|t="(.*)"$'
                match = re.search(conceptRegex, l.strip())
                groups = match.groups()

                concept_text = groups[0]
                beginLineNum = int(groups[1])
                beginTokenIndex = int(groups[2])
                lastLineNum = int(groups[3])
                lastTokenIndex = int(groups[4])
                conceptLabel = groups[5]

                assert beginLineNum == lastLineNum, 'concept must span single line'

                tup = (conceptLabel, beginLineNum, beginTokenIndex,
                       lastTokenIndex)
                tokenizedConcepts.append(tup)

        tokenizedConcepts = list(set(tokenizedConcepts))
        tokenizedConcepts = sorted(tokenizedConcepts, key=lambda t: t[1:])

        # Ensure no overlapping concepts (that would be bad)
        for i in range(len(tokenizedConcepts) - 1):
            c1 = tokenizedConcepts[i]
            c2 = tokenizedConcepts[i + 1]
            if c1[1] == c2[1]:
                if c1[2] <= c2[2] and c2[2] <= c1[3]:
                    fname = os.path.basename(concept)
                    error1 = '%s has overlapping entities on line %d' % (fname,
                                                                         c1[1])
                    error2 = "It can't be processed until you remove one"
                    error3 = 'Please modify this file: %s' % concept
                    error4 = '\tentity 1: c="%s" %d:%d %d:%d||t="%s"' % (
                        ' '.join(
                            tokenizedSentences[c1[1] - 1][c1[2]:c1[3] + 1]),
                        c1[1], c1[2], c1[1], c1[3], c1[0])
                    error5 = '\tentity 2: c="%s" %d:%d %d:%d||t="%s"' % (
                        ' '.join(
                            tokenizedSentences[c2[1] - 1][c2[2]:c2[3] + 1]),
                        c2[1], c2[2], c2[1], c2[3], c2[0])
                    error_msg = '\n\n%s\n%s\n\n%s\n\n%s\n%s\n' % (
                        error1, error2, error3, error4, error5)
                    raise DocumentException(error_msg)

    return tokenizedSentences, tokenizedConcepts
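
A usage sketch for the function above, assuming readDocs and its helpers (clean_text, normalize_tokens) are importable from the project; the file paths are hypothetical:

# Hypothetical paths: the .txt file holds the line- and space-tokenized record,
# the .con file holds one c="..."||t="..." annotation per line.
record_path  = 'data/record-13.txt'
concept_path = 'data/record-13.con'

sents, concepts = readDocs(record_path, concept_path)

# sents    : one list of normalized, lowercased tokens per line of the record
# concepts : (label, line_number, start_token, end_token) tuples, with
#            line numbers counted from 1 and token indices from 0
print(sents[0][:5])
print(concepts[:3])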
Example #4
File: documents.py  Project: wboag/CliNER
import os
import re

# clean_text, normalize_tokens, classification_cmp, and DocumentException are
# helpers defined elsewhere in the CliNER project.
def read_i2b2(txt, con):
    """
    read_i2b2()

    @param txt. A file path for the tokenized medical record
    @param con. A file path for the i2b2 annotated concepts for txt
    """
    tokenized_sents = []

    sent_tokenize = lambda text: text.split('\n')
    word_tokenize = lambda text: text.split(' ')

    # Read in the medical text
    with open(txt) as f:
        # Original text file
        text = f.read().strip('\n')

        # tokenize
        sentences = sent_tokenize(text)
        for sentence in sentences:
            sent = clean_text(sentence.rstrip())

            # lowercase (like word2vec preprocessing)
            sent = sent.lower()

            toks = word_tokenize(sent)

            # normalize tokens
            normed_toks = normalize_tokens(toks)

            #for w in normed_toks:
            #    print w
            #print

            tokenized_sents.append(normed_toks)

    # If an accompanying concept file was specified, read it
    tok_concepts = []
    if con:
        with open(con) as f:
            for line in f.readlines():
                # Empty line
                if not line.strip():
                    continue

                # parse concept line
                concept_regex = r'^c="(.*)" (\d+):(\d+) (\d+):(\d+)\|\|t="(.*)"$'
                match = re.search(concept_regex, line.strip())
                groups = match.groups()

                # retrieve regex info
                concept_text  =     groups[0]
                start_lineno  = int(groups[1])
                start_tok_ind = int(groups[2])
                end_lineno    = int(groups[3])
                end_tok_ind   = int(groups[4])
                concept_label =     groups[5]

                # pre-process text for error-check
                #matching_line = tokenized_sents[start_lineno-1]
                #matching_toks = matching_line[start_tok_ind:end_tok_ind+1]
                #matching_text = ' '.join(matching_toks).lower()
                #concept_text  = ' '.join(word_tokenize(concept_text))

                # error-check info
                assert start_lineno==end_lineno, 'concept must span single line'
                #assert concept_text==matching_text, 'something wrong with inds'

                # add the concept info
                tup = (concept_label, start_lineno, start_tok_ind, end_tok_ind)
                tok_concepts.append(tup)

        # Safe guard against concept file having duplicate entries
        tok_concepts = list(set(tok_concepts))

        # Concept file does not guarantee ordering by line number
        tok_concepts = sorted(tok_concepts, cmp=classification_cmp)

        # Ensure no overlapping concepts (that would be bad)
        for i in range(len(tok_concepts)-1):
            c1 = tok_concepts[i]
            c2 = tok_concepts[i+1]
            if c1[1] == c2[1]:
                if c1[2] <= c2[2] and c2[2] <= c1[3]:
                    fname = os.path.basename(con)
                    error1 = '%s has overlapping entities on line %d' % (fname, c1[1])
                    error2 = "It can't be processed until you remove one"
                    error_msg = '%s\n%s' % (error1, error2)
                    raise DocumentException(error_msg)

    return tokenized_sents, tok_concepts
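
The final loop in each example rejects overlapping annotations by comparing consecutive concepts after sorting. A self-contained sketch of that check on toy tuples (the overlaps helper is only for illustration):

# Concept tuples in the (label, line_number, start_token, end_token) layout,
# already sorted by line number and start token.
c1 = ('problem', 12, 3, 6)
c2 = ('treatment', 12, 5, 8)   # starts inside c1's 3..6 span -> overlap
c3 = ('treatment', 12, 7, 9)   # starts after c1 ends        -> no overlap

def overlaps(a, b):
    # Same-line check plus the span condition used in the examples above.
    return a[1] == b[1] and a[2] <= b[2] <= a[3]

print(overlaps(c1, c2))   # True  -> read_i2b2 would raise DocumentException
print(overlaps(c1, c3))   # False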