Example #1
    def setUp(self):
        # create the pyConText context and sentence splitter used by the tests
        self.context = pyConText.pyConText()
        self.splitter = helpers.sentenceSplitter()

        self.su1 = u'kanso <Diagnosis>**diabetes**</Diagnosis> utesl\xf6t eller diabetes men inte s\xe4kert. Vi siktar p\xe5 en r\xf6ntgenkontroll. kan det vara nej panik\xe5ngesten\n?'
        self.su2 = u'IMPRESSION: 1. LIMITED STUDY DEMONSTRATING NO GROSS EVIDENCE OF SIGNIFICANT PULMONARY EMBOLISM.'
        self.su3 = u'This is a sentence that does not end with a number. But this sentence ends with 1.'
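These fixture strings are what splitSentences is run against in the tests further down; as a standalone illustration (assuming this project's helpers module is importable), the splitter is driven like this:

splitter = helpers.sentenceSplitter()
sentences = splitter.splitSentences(
    u'Vi siktar p\xe5 en r\xf6ntgenkontroll. kan det vara nej panik\xe5ngesten?')
print(len(sentences))  # expected: one entry per recognized sentence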
Example #2
    def setUp(self):
        # create the ConTextMarkup object and sentence splitter used by the tests
        self.context = pyConText.ConTextMarkup()
        self.splitter = helpers.sentenceSplitter()

        self.su1 = u'kanso <Diagnosis>**diabetes**</Diagnosis> utesl\xf6t eller diabetes men inte s\xe4kert. Vi siktar p\xe5 en r\xf6ntgenkontroll. kan det vara nej panik\xe5ngesten\n?'
        self.su2 = u'IMPRESSION: 1. LIMITED STUDY DEMONSTRATING NO GROSS EVIDENCE OF SIGNIFICANT PULMONARY EMBOLISM.'
        self.su3 = u'This is a sentence that does not end with a number. But this sentence ends with 1. So this should be recognized as a third sentence.'
        self.su4 = u'This is a sentence with a numeric value equal to 1.43 and should not be split into two parts.'
        self.items = [[
            u"pulmonary embolism", u"PULMONARY_EMBOLISM",
            ur"""pulmonary\s(artery )?(embol[a-z]+)""", ""
        ], [
            "no gross evidence of", "PROBABLE_NEGATED_EXISTENCE", "", "forward"
        ]]
        self.itemData = itemData.itemData()
        for i in self.items:
            # The snippet is truncated here; the remaining lines are an assumed
            # completion that builds and registers each contextItem.
            cit = itemData.contextItem(i)
            self.itemData.append(cit)
Example #3
    def setUp(self):
        # create the ConTextMarkup object and sentence splitter used by the tests
        self.context = pyConText.ConTextMarkup()
        self.splitter = helpers.sentenceSplitter()

        self.su1 = u'kanso <Diagnosis>**diabetes**</Diagnosis> utesl\xf6t eller diabetes men inte s\xe4kert. Vi siktar p\xe5 en r\xf6ntgenkontroll. kan det vara nej panik\xe5ngesten\n?'
        self.su2 = u'IMPRESSION: 1. LIMITED STUDY DEMONSTRATING NO GROSS EVIDENCE OF SIGNIFICANT PULMONARY EMBOLISM.'
        self.su3 = u'This is a sentence that does not end with a number. But this sentence ends with 1.'
        self.items = [[
            u"pulmonary embolism", u"PULMONARY_EMBOLISM",
            ur"""pulmonary\s(artery )?(embol[a-z]+)""", ""
        ], [
            "no gross evidence of", "PROBABLE_NEGATED_EXISTENCE", "", "forward"
        ]]
        self.itemData = itemData.itemData()
        for i in self.items:
            # Truncated in the source; assumed completion as in Example #2.
            cit = itemData.contextItem(i)
            self.itemData.append(cit)
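The third field of each item is the regular expression the contextItem matches on; a quick standalone check of what that pattern captures (a sketch, and the IGNORECASE flag is an assumption about how the library compiles it):

import re

pattern = re.compile(r"""pulmonary\s(artery )?(embol[a-z]+)""", re.IGNORECASE)
for text in ["significant pulmonary embolism",
             "pulmonary artery embolus",
             "pulmonary emboli were excluded"]:
    match = pattern.search(text)
    print("%s -> %s" % (text, match.group(0) if match else None))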
Example #4
def main(directory, N):
    print(
        f"The path to the file directory specified is {directory} with an N value of {N}."
    )
    print(
        '----------------------------------------------------------------------------------------------------------'
    )

    # Read in multiple files from directory with glob utility.
    name_map, all_data = helpers.globber(directory=directory)

    # Create frequency table as a python dict to analyze all_data and determine the frequency of every word.
    freq_df = helpers.frequencyCounter(data=all_data)

    # Eliminate stop words from frequency table.
    freq_df = helpers.stopWordRemover(file='stop-word-list.csv',
                                      frequency_table=freq_df)

    # Convert all_data from a list to a pandas dataframe.
    all_data = pd.DataFrame(all_data, columns=['text'])

    # Merge the name_map that holds all of the truncated names of the documents and the strings of the parsed documents
    # in all_data to form a comprehensive all_data pandas dataframe.
    all_data = name_map.join(all_data)

    # Create split_sentences data structure to hold every document and every sentence the document contains.
    split_sentences = helpers.sentenceSplitter(data=all_data)

    # Output configuration.
    output = helpers.outputConstructor(freq_df=freq_df,
                                       split_sentences=split_sentences,
                                       data=all_data,
                                       n=N)

    # Extract output to .csv file in the specified directory.
    output.to_csv('output.csv')
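A hypothetical command-line entry point for main(); the original module presumably wires this up elsewhere, so the flag name and default below are assumptions:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Word-frequency and sentence-split report over a directory of documents.')
    parser.add_argument('directory', help='path to the directory of input files')
    parser.add_argument('-n', dest='N', type=int, default=10,
                        help='top-N cutoff passed through to the output (assumed default)')
    args = parser.parse_args()
    main(args.directory, args.N)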
Example #5
    def test_sentenceSplitter2(self):
        """test whether we properly skip numbers with decimal points."""
        splitter = helpers.sentenceSplitter()
        sentences = splitter.splitSentences(self.su4)
        assert len(sentences) == 1
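A minimal stand-in for the behavior this test pins down: split on a sentence terminator followed by whitespace, but refuse to split when the period sits between two digits. This is a sketch, not the library's actual implementation:

import re

# A terminator that is neither preceded nor followed by a digit, then
# whitespace; the terminator itself is consumed in this simplified version.
_BOUNDARY = re.compile(r'(?<!\d)[.!?](?!\d)\s+')

def split_sentences(text):
    return [p.strip() for p in _BOUNDARY.split(text) if p.strip()]

print(split_sentences(u'A value equal to 1.43 stays together. Second sentence.'))
# ['A value equal to 1.43 stays together', 'Second sentence.']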
Example #6
    def test_sentenceSplitter1(self):
        """test whether we properly capture text that terminates without a recognized sentence termination."""
        splitter = helpers.sentenceSplitter()
        sentences = splitter.splitSentences(self.su3)
        assert len(sentences) == 3
Example #7
    def test_createSentenceSplitter(self):
        assert helpers.sentenceSplitter()
Example #8
    def test_sentenceSplitter1(self):
        """test whether we properly capture text that terminates without a recognized sentence termination."""
        splitter = helpers.sentenceSplitter()
        sentences = splitter.splitSentences(self.su3)
        assert len(sentences) == 2