Example #1
class ClassificationPipe(object):
    def __init__(self, sentence_rules, target_rules, context_rules):
        self.sentence_segmenter = RuSH(sentence_rules)
        self.targets = get_item_data(target_rules)
        self.modifiers = get_item_data(context_rules)

    def process(self, doc_text):
        sentences = self.sentence_segmenter.segToSentenceSpans(doc_text)

        new_anns = []
        for sentence in sentences:
            start_offset = sentence.begin
            sentence_text = doc_text[sentence.begin:sentence.end].lower()
            m = self.markup_sentence(sentence_text,
                                     modifiers=self.modifiers,
                                     targets=self.targets)
            annotations = self.convertMarkupsAnnotations(
                m, sentence_text, start_offset)
            new_anns.extend(annotations)

        return new_anns

    def markup_sentence(self, sentence, modifiers, targets):
        markup = pyConTextGraph.ConTextMarkup()
        txt = sentence.lower()
        markup.setRawText(txt)
        markup.graph["__txt"] = txt
        markup.graph["__scope"] = (0, len(txt))
        markup.markItems(targets, mode="target")
        markup.markItems(modifiers, mode="modifier")
        markup.pruneMarks()
        markup.dropMarks('Exclusion')
        markup.applyModifiers()
        markup.pruneSelfModifyingRelationships()
        markup.dropInactiveModifiers()
        return markup

    def convertMarkupsAnnotations(self, markups, sentence_text, offset=0):
        annotations = []
        nodes = markups.nodes()
        for n in nodes:
            new_ann = Annotation(
                start_index=offset + n.getSpan()[0],
                end_index=offset + n.getSpan()[1],
                type=n.getCategory()[0],
                spanned_text=sentence_text[n.getSpan()[0]:n.getSpan()[1]],
                ann_id=n.getTagID())
            mods = markups.getModifiers(n)
            for modifier in mods:
                new_ann.attributes[modifier.getCategory()[0]] = modifier.getTagID()

            annotations.append(new_ann)
        return annotations
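A minimal usage sketch for ClassificationPipe. The rule-file paths are placeholders (not files from this example), and the Annotation fields printed are assumed to be stored under the constructor-argument names used above:

# Hedged sketch: the three rule paths are placeholders for RuSH and pyConTextNLP rule files.
sentence_rules = 'KB/rush_rules.tsv'
target_rules = 'KB/fever_targets.yml'
context_rules = 'KB/general_modifiers.yml'

pipe = ClassificationPipe(sentence_rules, target_rules, context_rules)
for ann in pipe.process('Patient denies fever. No vomiting or chest pain on arrival.'):
    print(ann.start_index, ann.end_index, ann.type, ann.spanned_text, ann.attributes)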
Example #2
class TestRuSH(unittest.TestCase):
    def setUp(self):
        self.rush = RuSH('../conf/rush_rules.tsv')

    def test1(self):
        input_str = 'Can Mr. K check it. Look\n good.\n'
        sentences = self.rush.segToSentenceSpans(input_str)
        assert (sentences[0].begin == 0 and sentences[0].end == 19)
        assert (sentences[1].begin == 20 and sentences[1].end == 31)

    def test2(self):
        input_str = 'S/p C6-7 ACDF. No urgent events overnight. Pain control ON. '
        sentences = self.rush.segToSentenceSpans(input_str)
        assert (sentences[0].begin == 0 and sentences[0].end == 14)
        assert (sentences[1].begin == 15 and sentences[1].end == 42)
        assert (sentences[2].begin == 43 and sentences[2].end == 59)

    def test3(self):
        input_str = ''' •  Coagulopathy (HCC)    



 •  Hepatic encephalopathy (HCC)    



 •  Hepatorenal syndrome (HCC)    

'''
        sentences = self.rush.segToSentenceSpans(input_str)
        assert (sentences[0].begin == 1 and sentences[0].end == 22)
        assert (sentences[1].begin == 31 and sentences[1].end == 62)
        assert (sentences[2].begin == 71 and sentences[2].end == 100)

    def test4(self):
        input_str = 'Delirium - '
        sentences = self.rush.segToSentenceSpans(input_str)
        assert (sentences[0].begin == 0 and sentences[0].end == 10)

    def test5(self):
        input_str = "The patient complained about the TIA \n\n No memory issues. \"I \n\nOrdered the MRI scan.- "
        sentences = self.rush.segToSentenceSpans(input_str)
        assert (sentences[0].begin == 0 and sentences[0].end == 36)
        assert (sentences[1].begin == 39 and sentences[1].end == 85)

    def printDetails(self, sentences, input_str):
        # Print ready-to-paste assert statements for the segmented sentence spans.
        for i, sentence in enumerate(sentences):
            print('assert (sentences[' + str(i) + '].begin == ' +
                  str(sentence.begin) + ' and sentences[' + str(i) +
                  '].end == ' + str(sentence.end) + ')')

    def test6(self):
        input_str = '''The Veterans Aging Cohort Study (VACS) is a large, longitudinal, observational study of a cohort of HIV infected and matched uninfected Veterans receiving care within the VA [2]. This cohort was designed to examine important health outcomes, including cardiovascular diseases like heart failure, among HIV infected and uninfected Veterans.'''
        sentences = self.rush.segToSentenceSpans(input_str)
        self.printDetails(sentences, input_str)
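A standard runner for the test case above, assuming the usual `import unittest` and RuSH imports at the top of the file and that ../conf/rush_rules.tsv exists:

if __name__ == '__main__':
    unittest.main()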
Example #3
class Mypipe:
    """PyContextNLP pipeline, sentence_rules, target_rules, context_rules, feature_inference_rule, document_inference_rule"""
    def __init__(self, sentence_rules, target_rules, context_rules,
                 feature_inference_rule, document_inference_rule):

        self.sentence_rules = sentence_rules
        self.target_rules = target_rules
        self.context_rules = context_rules
        self.feature_inference_rule = feature_inference_rule
        self.document_inference_rule = document_inference_rule

        self.sentence_segmenter = RuSH(self.sentence_rules)
        self.feature_inferencer = FeatureInferencer(
            self.feature_inference_rule)
        self.document_inferencer = DocumentInferencer(
            self.document_inference_rule)
        self.targets = get_item_data(self.target_rules)
        self.modifiers = get_item_data(self.context_rules)

    def process(self, doc_text):
        """PyContextNLP, return doc_class, context_doc, annotations, relations"""

        context_doc = pyConTextGraph.ConTextDocument()
        sentences = self.sentence_segmenter.segToSentenceSpans(doc_text)

        for sentence in sentences:

            sentence_text = doc_text[sentence.begin:sentence.end].lower()
            # Process every sentence by adding markup
            m = markup_sentence(sentence_text,
                                modifiers=self.modifiers,
                                targets=self.targets)
            context_doc.addMarkup(m)
            context_doc.getSectionMarkups()
            # print(m)
            # print(context_doc.getXML())

        # convert the graph markups into dataframes
        markups = get_document_markups(context_doc)
        annotations, relations, doc_txt = convertMarkups2DF(markups)
        # display(annotations)
        # display(relations)

        # apply inference rules for document classification
        inferenced_types = self.feature_inferencer.process(
            annotations, relations)
        # print('After inferred from modifier values, we got these types:\n '+str(inferenced_types))
        doc_class = self.document_inferencer.process(inferenced_types)
        # print('\nDocument classification: '+ doc_class )

        return doc_class, context_doc, annotations, relations
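A minimal usage sketch for Mypipe; every path below is a placeholder for a rule file in the format expected by RuSH, pyConTextNLP, FeatureInferencer, and DocumentInferencer:

# Hedged sketch: all rule-file paths are placeholders.
pipe = Mypipe(sentence_rules='KB/rush_rules.tsv',
              target_rules='KB/fever_targets.yml',
              context_rules='KB/general_modifiers.yml',
              feature_inference_rule='KB/feature_inferences.csv',
              document_inference_rule='KB/doc_inferences.csv')
doc_class, context_doc, annotations, relations = pipe.process(
    'On operative day three, the patient fever was detected with temperature 101.5 F.')
print(doc_class)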
Example #4
class ClinicalRushSentenceTokenizer(object):
    def __init__(self, rules='./rush_rules.tsv'):
        self.rules = rules
        self.rush = RuSH(self.rules)

    def tokenize_sents(self, text):
        #try:
        #    sent_spans = self.rush.segToSentenceSpans(text)
        #except Exception as e:
        #    # Let's try to track down where this is happening in the text
        #    for i in range(int(len(text)/10)):
        #        start = i * 10
        #        end = start + 10
        #        try:
        #            self.rush.segToSentenceSpans(text[start:end])
        #        except Exception as e:
        #            with open('failed_snippet.txt', 'a') as f:
        #                f.write(text[start:end] + '\n')
        #            print("Failed at {}".format(start))
        #            raise e
        #sent_spans = [(s.begin, s.end) for s in sent_spans]
        sent_spans = self.rush.segToSentenceSpans(text)
        return sent_spans
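A minimal usage sketch for ClinicalRushSentenceTokenizer, assuming ./rush_rules.tsv exists; the returned RuSH spans expose begin/end offsets into the input text:

tokenizer = ClinicalRushSentenceTokenizer('./rush_rules.tsv')
text = 'S/p C6-7 ACDF. No urgent events overnight. Pain control ON. '
for span in tokenizer.tokenize_sents(text):
    print(span.begin, span.end, text[span.begin:span.end])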
Example #5
document_inferencer = DocumentInferencer(document_inference_rule)

targets = get_item_data(target_rules)
modifiers = get_item_data(context_rules)

# Example sentences
#input_text = 'This is a sentence. It is just a test. I like this sentence.'

input_text = '''
No vomiting, chest pain, shortness of breath, nausea, dizziness, or chills on arrival.
On operative day three, the patient fever was detected with temperature 101.5 F.
After 3 days no fever was detected.
Patient came back for a follow up, denies fever.
'''

sentences = sentence_segmenter.segToSentenceSpans(input_text)

# See what the document was split into
for sentence in sentences:
    print("Sentence({}-{}):\t{}".format(sentence.begin, sentence.end,
                                        input[sentence.begin:sentence.end]))
    print('\n' + '-' * 100 + '\n')

# initiate a pyConTextGraph to hold the pyConText output
context_doc = pyConTextGraph.ConTextDocument()

for sentence in sentences:
    sentence_text = input_text[sentence.begin:sentence.end].lower()
    # Process every sentence by adding markup
    m = markup_sentence(sentence_text, modifiers=modifiers, targets=targets)
    context_doc.addMarkup(m)
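A possible continuation of this example, mirroring the Mypipe.process flow in Example #3; it assumes get_document_markups and convertMarkups2DF are imported, and that feature_inferencer was built from a rule file in the same way document_inferencer was above:

# Hedged continuation: feature_inferencer is assumed to exist alongside document_inferencer.
markups = get_document_markups(context_doc)
annotations, relations, doc_txt = convertMarkups2DF(markups)
inferenced_types = feature_inferencer.process(annotations, relations)
doc_class = document_inferencer.process(inferenced_types)
print('Document classification: ' + doc_class)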
Example #6
class ExtractionPipe(object):
    def __init__(self, sentence_rules, target_rules, between_rules):
        self.sentence_segmenter = RuSH(sentence_rules)
        self.targets = get_item_data(target_rules)
        self.between_rules = between_rules

    def process(self, doc_text):
        sentences = self.sentence_segmenter.segToSentenceSpans(doc_text)

        new_anns = []
        for sentence in sentences:
            start_offset = sentence.begin
            sentence_text = doc_text[sentence.begin:sentence.end].lower()
            m = self.markup_sentence_extract(sentence_text,
                                             targets=self.targets)
            annotations = self.classify_relationships(m, sentence_text,
                                                      start_offset)
            new_anns.extend(annotations)

        return new_anns

    def markup_sentence_extract(self, sentence, targets):
        markup = pyConTextGraph.ConTextMarkup()
        txt = sentence.lower()
        markup.setRawText(txt)
        markup.graph["__txt"] = txt
        markup.graph["__scope"] = (0, len(txt))
        markup.markItems(targets, mode="target")
        markup.pruneMarks()
        markup.dropMarks('Exclusion')
        markup.pruneSelfModifyingRelationships()
        return markup

    def classify_relationships(self, markups, sentence_text, offset=0):
        all_targets = markups.getMarkedTargets()
        annotations = []
        if len(all_targets) > 1:
            for index, target in enumerate(all_targets):
                target_cat = target.getCategory()[0]
                try:
                    future_target = all_targets[index + 1]
                    future_cat = future_target.getCategory()[0]
                except Exception:
                    # No following target to pair with; skip this target.
                    continue
                if target_cat == 'oxygen_saturation':
                    if future_cat == 'value':
                        start_text_index = target.getSpan()[1]
                        end_text_index = future_target.getSpan()[0]
                        between_text = sentence_text[
                            start_text_index:end_text_index]
                        between_text = between_text.strip()
                        if between_text in self.between_rules:
                            new_ann = Annotation(
                                start_index=offset +
                                future_target.getSpan()[0],
                                end_index=offset + future_target.getSpan()[1],
                                type=future_cat,
                                spanned_text=sentence_text[
                                    future_target.getSpan(
                                    )[0]:future_target.getSpan()[1]],
                                ann_id=future_target.getTagID())
                            annotations.append(new_ann)
        return annotations
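A minimal usage sketch for ExtractionPipe; the rule paths, the between_rules strings, and the sample sentence are placeholders, and whether anything is extracted depends entirely on the target rules supplied. As in the Example #1 sketch, Annotation is assumed to expose its constructor arguments as attributes:

# Hedged sketch: the rule files and between_rules below are placeholders.
pipe = ExtractionPipe(sentence_rules='KB/rush_rules.tsv',
                      target_rules='KB/o2_sat_targets.yml',
                      between_rules={'of', 'was', 'is'})
for ann in pipe.process('O2 sat of 94 % on room air.'):
    print(ann.start_index, ann.end_index, ann.type, ann.spanned_text)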
Example #7
class NLPClassificationSystem:
    def __init__(self):
        #initiate necessary components
        self.target_rules = self.getTargetRegexes()
        self.negation_rules = self.getNegRegexes()
        self.section_rules = self.getSectionRegexes()  # new
        self.target_scores = self.target_score()  # new
        self.sentence_rules = 'KB/rush_rules.tsv'
        self.sentence_segmenter = RuSH(self.sentence_rules)

    def process(self, document):
        # document.text = self.filterSection(document.text) # new
        document_id = document.document_id
        ann_index = 0
        #---------
        #all_sent = sent_tokenize(document.text)
        sentences = self.sentence_segmenter.segToSentenceSpans(document.text)
        #sent_begin = 0
        for sentence in sentences:
            sent = document.text[sentence.begin:sentence.end].lower()
            #---------
            for reg in self.target_rules:
                for match in reg.finditer(sent):
                    # Use str(document_id) here if document_id is numeric.
                    ann_id = 'NLP_' + document_id + '_' + str(ann_index)
                    ann_index = ann_index + 1
                    new_annotation = Annotation(
                        start_index=int(match.start() + sentence.begin),
                        end_index=int(match.end() + sentence.begin),
                        type='psy_ann',
                        ann_id=ann_id)
                    new_annotation.spanned_text = match.group()
                    #new_annotation.spanned_text = sent[new_annotation.start_index:new_annotation.end_index]

                    for neg_regex in self.negation_rules:
                        if re.search(neg_regex, sent):
                            new_annotation.attributes["Negation"] = "Negated"

                    document.annotations.append(new_annotation)
            #sent_begin = sent_begin + len(sent)
        return document

    def getTargetRegexes(self):
        target_regexes = []
        with open('./KB/NIMH_target_1116_ax.csv',
                  'r') as f1:  #NIMH_target_1116.csv
            regexes = f1.read().splitlines()
        for reg in regexes:
            if reg.startswith('#'):
                continue
            reg = reg.replace("\"", "")
            target_regexes.append(re.compile(reg, re.IGNORECASE))
        return target_regexes

    def getNegRegexes(self):
        neg_regexes = []
        with open('./KB/NIMH_negation_1116.csv', 'r') as f1:
            regexes = f1.read().splitlines()

        for reg in regexes:
            if reg.startswith('#'):
                continue
            reg = reg.replace("\"", "")
            neg_regexes.append(re.compile(reg, re.IGNORECASE))
        return neg_regexes

    def getSectionRegexes(self):  # new
        section_regexes = []
        with open('./KB/section_1116_ax.csv', 'r') as f1:
            regexes = f1.read().splitlines()

        for reg in regexes:
            if reg.startswith('#'):
                continue
            reg = reg.replace("\"", "")
            section_regexes.append(
                re.compile(
                    reg,
                    re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE))
        return section_regexes

    def filterSection(self, txt):  # new
        txt_list = []
        for reg in self.section_rules:
            for match in reg.finditer(txt):
                txt_list.append(match.group())
        txt_str = '...... '.join(txt_list)
        return txt_str

    def target_score(self):
        ann_target_score = pd.read_csv("./KB/NIMH_target_score_1116.csv",
                                       sep='$')
        score1 = dict()
        for index, row in ann_target_score.iterrows():
            row0 = str(row[0]).lower()
            score = {row0: row[1]}
            score1.update(score)
        return score1
Example #8
class RBDocumentClassifier(BaseClassifier):
    ready = True

    def __init__(self,
                 targets=None,
                 modifiers=None,
                 feature_inference_rule=None,
                 document_inference_rule=None,
                 rush_rule=None,
                 expected_values=[],
                 save_markups=True):
        self.document_inferencer = DocumentInferencer(document_inference_rule)
        self.feature_inferencer = FeatureInferencer(feature_inference_rule)
        self.conclusions = []
        self.modifiers = modifiers
        self.targets = targets
        self.save_markups = save_markups
        self.expected_values = [value.lower() for value in expected_values]
        self.saved_markups_map = dict()
        self.pyrush = None
        if rush_rule is None or not os.path.isfile(rush_rule):
            rush_rule = ConfigReader.getValue('rush_rules_path')
        if rush_rule is not None and os.path.isfile(rush_rule):
            self.pyrush = RuSH(rush_rule)
        else:
            # rush_rule may still be None here, so avoid os.path.abspath(None)
            logMsg(("File not found", rush_rule))
        self.last_doc_name = ''

        if modifiers is not None and targets is not None:
            if isinstance(modifiers, str) and isinstance(targets, str):
                if modifiers.endswith(('.csv', '.tsv', '.txt', '.yml')) and \
                        (targets.endswith(('.csv', '.tsv', '.txt', '.yml')) or targets.startswith('Lex\t')):
                    self.setModifiersTargetsFromFiles(modifiers, targets)
            else:
                self.setModifiersTargets(modifiers, targets)
        RBDocumentClassifier.instance = self

    def setModifiersTargets(self, modifiers, targets):
        self.modifiers = modifiers
        self.targets = targets

    def setModifiersTargetsFromFiles(self, modifiers_file, targets_file):
        self.targets = get_item_data(targets_file)
        self.modifiers = get_item_data(modifiers_file)

    def reset_saved_predictions(self):
        self.saved_markups_map = {}
        self.save_markups = True
        self.expected_value = None

    def predict(self, doc, doc_name='t_m_p.txt'):
        self.last_doc_name = doc_name
        doc_conclusion = self.classify(doc, doc_name)
        if doc_conclusion in self.expected_values:
            return 1
        return 0

    def eval(self, gold_docs):
        import sklearn.metrics
        import pandas as pd
        fn_docs = []
        fp_docs = []
        prediction_metrics = []
        gold_labels = [x.positive_label for x in gold_docs.values()]
        pred_labels = []
        logMsg('Start to evaluate against reference standards...')
        for doc_name, gold_doc in gold_docs.items():
            gold_label = gold_doc.positive_label
            pred_label = self.predict(gold_doc.text, doc_name)
            pred_labels.append(pred_label)
            # Differentiate false positive and false negative errors.
            if gold_label == 0 and pred_label == 1:
                fp_docs.append(doc_name)
            elif gold_label == 1 and pred_label == 0:
                fn_docs.append(doc_name)

        precision = sklearn.metrics.precision_score(gold_labels, pred_labels)
        recall = sklearn.metrics.recall_score(gold_labels, pred_labels)
        f1 = sklearn.metrics.f1_score(gold_labels, pred_labels)
        # Let's use Pandas to make a confusion matrix for us
        confusion_matrix_df = pd.crosstab(
            pd.Series(gold_labels, name='Actual'),
            pd.Series(pred_labels, name='Predicted'))
        prediction_metrics.append('Precision : {0:.3f}'.format(precision))
        prediction_metrics.append('Recall :    {0:.3f}'.format(recall))
        prediction_metrics.append('F1:         {0:.3f}'.format(f1))

        return fn_docs, fp_docs, '\n'.join(
            prediction_metrics), confusion_matrix_df[[1, 0]].reindex([1, 0])

    def predict_against(self, doc, expected_values, doc_name='t_m_p.txt'):
        doc_conclusion = self.classify(doc, doc_name)
        if doc_conclusion in expected_values:
            return 1
        return 0

    def classify(self, doc, doc_name='t_m_p.txt'):
        self.last_doc_name = doc_name
        if self.modifiers is None or self.targets is None:
            logMsg(
                'DocumentClassifier\'s "modifiers" and/or "targets" has not been set yet.\n'
                'Use setModifiersTargets(modifiers, targets) or '
                'setModifiersTargetsFromFiles(modifiers_file, targets_file) to set them up.')
        try:
            context_doc = self.markup_context_document(doc, self.modifiers,
                                                       self.targets)
            if self.save_markups and doc_name is not None and len(
                    context_doc.getDocumentGraph().nodes()) > 0:
                self.saved_markups_map[doc_name] = context_doc
            markups = get_document_markups(context_doc)

            annotations, relations, doc_txt = convertMarkups2DF(markups)
            matched_conclusion_types = self.feature_inferencer.process(
                annotations, relations)
            doc_conclusion = self.document_inferencer.process(
                matched_conclusion_types)
        except Exception:
            # pyConText might throw errors in some cases; fall back to the default conclusion.
            doc_conclusion = self.document_inferencer.default_conclusion
        return doc_conclusion

    def train(self, x, y):
        """just for implement the interface"""
        pass

    def get_last_context_doc(self):
        if self.last_doc_name in self.saved_markups_map:
            return self.saved_markups_map[self.last_doc_name]
        else:
            return None

    def markup_context_document(self, report_text, modifiers, targets):
        context = pyConTextGraph.ConTextDocument()

        # Fall back to TextBlob for sentence splitting when no RuSH rules were loaded
        if self.pyrush is None:
            from textblob import TextBlob
            sentences = [s.raw for s in TextBlob(report_text).sentences]
        else:
            sentences = [
                report_text[sentence.begin:sentence.end]
                for sentence in self.pyrush.segToSentenceSpans(report_text)
            ]
        for sentence in sentences:
            m = markup_sentence(sentence, modifiers=modifiers, targets=targets)
            context.addMarkup(m)
            context.getSectionMarkups()

        return context
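A minimal usage sketch for RBDocumentClassifier; every path and the expected class name are placeholders, and predict() returns 1 only when the inferred document class is one of expected_values:

# Hedged sketch: rule-file paths and the expected class name are placeholders.
classifier = RBDocumentClassifier(targets='KB/fever_targets.yml',
                                  modifiers='KB/general_modifiers.yml',
                                  feature_inference_rule='KB/feature_inferences.csv',
                                  document_inference_rule='KB/doc_inferences.csv',
                                  rush_rule='KB/rush_rules.tsv',
                                  expected_values=['fever_present'])
print(classifier.predict('On operative day three, the patient fever was detected.'))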
Example #9
12.  Vancomycin 750 mg intravenously twice per day (times 14
days).
13.  Codeine/guaifenesin syrup 5 cc to 10 cc by mouth q.6h.
as needed.
14.  Klonopin 0.75 mg by mouth in the morning and 0.5 mg by
mouth at hour of sleep.
15.  Multivitamin one tablet by mouth once per day.




                          [**Name6 (MD) 2381**] [**Last Name (NamePattern4) 3424**], M.D.  [**MD Number(1) 3425**]

Dictated By:[**Last Name (NamePattern1) 3426**]

MEDQUIST36

D:  [**3399-4-10**]  14:55
T:  [**3399-4-12**]  08:50
JOB#:  [**Job Number 19798**]
'''

rush = RuSH('conf/rush_rules.tsv')
sentences = rush.segToSentenceSpans(txt)
for sentence in sentences:
    print("Sentence({}-{}):\t{}".format(sentence.begin, sentence.end,
                                        txt[sentence.begin:sentence.end]))
    print('\n' + '-' * 100 + '\n')
Example #10
class PreProcessing:
    def __init__(self,
                 annotation_type='SOCIAL_SUPPORT',
                 default_value='no mention',
                 filter_file='conf/keywords_filter.txt',
                 stopwords_file='conf/stop_words.txt',
                 word2vec_file='models/glove.word2vec.txt.bin',
                 rush_rules='conf/rush_rules.tsv',
                 max_token_per_sentence=150):
        # Each run trains/predicts a model for a single annotation type.
        # Use an arbitrary max sentence length so sentences can be padded without
        # knowing the maximum sentence length of the testing set in advance.

        self.max_token_per_sentence = max_token_per_sentence
        self.annotation_type = annotation_type
        self.default_value = default_value
        self.real_max_length = 0
        self.rush = RuSH(rush_rules)
        self.html_tokens_p = re.compile(r'^&[a-z]{2,4};$')  # matches HTML entities such as &amp;
        self.punctuations = set(string.punctuation)
        # keep '?'
        self.punctuations.remove('?')
        self.spacy_nlp = spacy.load('en', disable=['parser', 'tagger', 'ner'])
        self.matcher = None
        self.corpus = None
        keywords_filter = []
        print('load filter keywords')
        # load filter keywords
        if path.isfile(filter_file):
            f = open(filter_file, 'r')
            keywords_filter = [
                line.strip() for line in f.readlines()
                if line.strip() and not line.startswith('#')
            ]
            f.close()
        if len(keywords_filter) > 0:
            self.matcher = matcher.PhraseMatcher(
                self.spacy_nlp.tokenizer.vocab, max_length=6)
            for keyword in keywords_filter:
                self.matcher.add(keyword, None)

        print('load stopwords')
        # load stop words
        if path.isfile(stopwords_file):
            f = open(stopwords_file, 'r')
            self.my_stopwords = set(line.strip() for line in f)
            f.close()
        else:
            self.my_stopwords = set(nltk.corpus.stopwords.words('english'))
            f = open(stopwords_file, 'w')
            f.writelines('\n'.join(self.my_stopwords))
            f.close()

        print('load label dictionary')
        self.label_dict = None
        self.label_dict_file = 'models/' + self.annotation_type + '_labels.dict'
        # load dictionary
        if path.isfile(self.label_dict_file):
            self.label_dict = Dictionary.load(self.label_dict_file)

        print('load glove model')
        # self.glove_model = glove2word2vec.smart_open(word2vec_file)
        if path.isfile(word2vec_file):
            if word2vec_file.endswith('.bin'):
                self.glove_model = KeyedVectors.load_word2vec_format(
                    word2vec_file, binary=True)
            else:
                self.glove_model = KeyedVectors.load_word2vec_format(
                    word2vec_file, binary=False)
                print('convert txt model to binary model...')
                self.glove_model.save_word2vec_format(word2vec_file + '.bin',
                                                      binary=True)

        pass

    """ Given a plain text document, return a list of tokenized sentences that contain filter keywords"""

    def processDocument(self,
                        doc_text,
                        tokenized_sentences=[],
                        labels=[],
                        annotations=None,
                        doc_id=None):
        print(doc_id)
        sentences = self.rush.segToSentenceSpans(doc_text)
        sentences_txt = ([
            doc_text[sentence.begin:sentence.end] for sentence in sentences
        ])
        anno_id = 0
        for i in range(0, len(sentences_txt)):
            sentence = sentences_txt[i]
            label = self.default_value
            # If annotations are available, read them as labels.
            if annotations is not None and len(annotations) > 0:
                # Skip any annotations that end before this sentence begins.
                while anno_id < len(annotations) \
                        and annotations[anno_id]['end'] <= sentences[i].begin:
                    print(doc_id + str(annotations[anno_id]) + ' was skipped')
                    anno_id += 1
                if anno_id < len(annotations) \
                        and annotations[anno_id]['start'] >= sentences[i].begin \
                        and annotations[anno_id]['end'] <= sentences[i].end:
                    label = list(
                        annotations[anno_id]['attributes'].values())[0]
                    anno_id += 1

            words = [
                token for token in self.spacy_nlp.make_doc(sentence)
                if len(''.join(ch for ch in token.text
                               if ch not in self.punctuations)) > 0
                and not self.html_tokens_p.search(token.text)
                and not token.text.replace('.', '', 1).isdigit()
                and not token.text.replace('-', '', 1).isdigit()
                and token.text not in self.my_stopwords
            ]
            if self.real_max_length < len(words):
                self.real_max_length = len(words)
            if self.get_matches(words):
                if len(words) < self.max_token_per_sentence:
                    tokenized_sentences.append(
                        self.pad_sentence([word.text for word in words]))
                    labels.append(label)
                else:
                    begin = 0
                    words = [word.text for word in words]
                    while begin <= len(words) - self.max_token_per_sentence:
                        tokenized_sentences.append(
                            words[begin:begin + self.max_token_per_sentence])
                        # overlap the sliced sub-sentences
                        begin += int(self.max_token_per_sentence / 2)
                    if begin < len(words):
                        tokenized_sentences.append(
                            self.pad_sentence(
                                words[len(words) -
                                      self.max_token_per_sentence:]))

        return tokenized_sentences

    def get_matches(self, sentence_tokens):
        # Return True when no keyword filter is configured, or when the phrase
        # matcher finds at least one keyword among the sentence tokens.
        if self.matcher is None:
            return True
        return len(self.matcher(sentence_tokens)) > 0

    # def processLabelledCorpus(self, corpus_dir):
    #     corpus_reader = EhostCorpusReader(corpus_dir)
    #     corpus = corpus_reader.parse()
    #     self.corpus = corpus
    #     tokenized_sentences = []
    #     labels = []
    #     for doc_id, doc in corpus.items():
    #         if self.annotation_type in doc['categorized']:
    #             annotations = [doc['annotations'][anno_id] for anno_id in doc['categorized'][self.annotation_type]]
    #         else:
    #             annotations = []
    #         self.processDocument(doc['text'], tokenized_sentences, labels, annotations, doc_id)
    #
    #     x, y = self.vectorize(tokenized_sentences, labels)
    #     return x, y

    def pad_sentence(self, sentence, padding_word="<PAD/>"):
        """
        Revised from alexander-rakhlin's code
        Pads all sentences to the same length. The length is defined by the longest sentence.
        Returns padded sentences.
        """
        num_padding = self.max_token_per_sentence - len(sentence)
        new_sentence = sentence + [padding_word] * num_padding
        return new_sentence

    def vectorize(self, sentences, labels=[]):
        """
        Revised from alexander-rakhlin's code, use glove models instead.
        Maps sentencs and labels to vectors based on a vocabulary.
        """

        print(labels)
        if self.label_dict is None:
            self.label_dict = gensim.corpora.Dictionary([set(labels)])
            self.label_dict.compactify()
            self.label_dict.save(self.label_dict_file)
            self.label_dict.save_as_text(self.label_dict_file + '.txt')

        print(set(labels))
        x = np.array([[
            self.glove_model.word_vec(word) if word in self.glove_model.vocab
            else np.random.uniform(-0.25, 0.25, self.glove_model.vector_size)
            for word in sentence
        ] for sentence in sentences])
        y = None
        if len(labels) > 0:
            y = np.zeros((len(labels), len(self.label_dict.keys())))
            for i in range(0, len(labels)):
                label = labels[i]
                y[i][self.label_dict.token2id[label]] = 1

            shuffle_indices = np.random.permutation(np.arange(len(y)))
            x = x[shuffle_indices]
            y = y[shuffle_indices]
        return x, y
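A minimal usage sketch for PreProcessing, assuming the conf/ and models/ resources referenced by the default arguments (keyword filter, stop words, GloVe vectors, RuSH rules) are actually present; the document text and doc_id are placeholders:

# Hedged sketch: only works if the default resource files exist.
prep = PreProcessing(annotation_type='SOCIAL_SUPPORT')
tokenized = prep.processDocument(
    'Patient lives with her daughter and reports good family support.',
    doc_id='doc_001')
print(len(tokenized), 'sentences kept after keyword filtering')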
Example #11
    # Scratch/debug block: exercises several sentence and word tokenizers in turn;
    # the exit() calls below stop execution early while debugging the later sections.
    with open('failed.txt') as f:
        input_str = f.read()

    sent_tokenizer = ClinicalRushSentenceTokenizer('rush_rules.tsv')
    # Immediately overridden to compare against the default tokenizer while debugging.
    sent_tokenizer = DefaultSentenceTokenizer()
    print(sent_tokenizer.tokenize_sents(input_str))
    #print(sent_tokenizer.span_tokenize(input_str))
    exit()
    print(sent_tokenizer.tokenize_sents(input_str))

    word_tokenizer = TreebankWordTokenizer()
    doc_tokenizer = DocumentTokenizer(rush, word_tokenizer)  # `rush`: a RuSH instance assumed to be defined earlier in the original script
    print(doc_tokenizer.tokenize_doc(input_str))
    exit()

    sentences = rush.segToSentenceSpans(input_str)

    #nlp = spacy.load('en_core_web_sm')
    for sentence in sentences[:1]:
        print('Sentence({0}-{1}):\t>{2}<'.format(
            sentence.begin, sentence.end,
            input_str[sentence.begin:sentence.end]))

        text = input_str[sentence.begin:sentence.end]
        print(word_tokenizer.tokenize(text))
        print(word_tokenizer.span_tokenize(text))
        spans = word_tokenizer.span_tokenize(text)
        tokens = word_tokenizer.tokenize(text)
        for span, token in zip(spans, tokens):
            print(span, token)
            assert (text[span[0]:span[1]] == token)
Example #12
    def pyRuSHSplitter(self, text):
        rush = RuSH(ConfigReader.getValue('rush_rules_path'))
        sentences = rush.segToSentenceSpans(text)
        return [
            text[sentence.begin:sentence.end].strip() for sentence in sentences
        ]