def get_file_sentences_and_idf(filename):
    """Split a document's ``<text>`` element into sentences and compute IDF.

    The file is read line by line (trailing whitespace stripped, lines
    concatenated without a separator), lower-cased and parsed with
    BeautifulSoup's ``html.parser``. The text of the first ``text`` tag is
    sentence-tokenized with the module-level ``tokenizer``.

    input: filename -- path of the document to read
    output: nothing is returned; the sentences are stored in the global
            ``sentenceList``, their count in the global ``n``, and
            ``calculate_idf()`` is called afterwards.
    """
    global sentenceList
    global stemmedList
    global n

    # Read inside a context manager so the handle is always closed, even if
    # reading fails part-way through.
    with open(filename, 'r') as doc:
        data = "".join(line.rstrip() for line in doc)

    # Lower-case everything before parsing.
    data = data.lower()

    # Use BeautifulSoup to read the xml-like markup.
    soup = BeautifulSoup(data, 'html.parser')

    # A document without a <text> tag yields an empty text.
    text_node = soup.find("text")
    text = text_node.get_text() if text_node is not None else ""

    # Joining and re-splitting on '\n' means an empty tokenizer result still
    # produces a one-element list containing the empty string.
    sentenceList = ('\n'.join(tokenizer.tokenize(text))).split('\n')
    n = len(sentenceList)
    calculate_idf()
示例#2
0
文件: W2v.py 项目: saridsa1/cdc
def Tokenization(data, concept, stem, removeStopwords):
    """Split ``data`` into a list of word tokens with optional clean-up.

    Args:
        data: Input text.
        concept: When falsy, markup is stripped with BeautifulSoup, CR/LF
            pairs and every character outside ``[a-zA-Z0-9_]`` become
            spaces, and the text is lower-cased. When truthy the text is
            tokenized as given.
        stem: When truthy, every token is passed through ``PorterStemmer``.
        removeStopwords: When truthy, NLTK's English stopwords are dropped.

    Returns:
        The list of whitespace-separated tokens after the requested steps.
    """
    if not concept:
        data = BeautifulSoup(data).get_text()
        data = re.sub("\r\n", " ", data)
        data = re.sub("[^a-zA-Z0-9_]", " ", data)
        data = data.lower()

    words = data.split()

    # Stopwords are filtered BEFORE stemming: stemmed forms of stopwords
    # (e.g. "this" -> "thi") would no longer be found in the stopword list.
    if removeStopwords:
        stops = set(stopwords.words("english"))
        if stem:
            # NOTE(review): the stemmer is expected to normalise case, so
            # stopwords are matched case-insensitively in this mode --
            # confirm against the installed NLTK version.
            words = [w for w in words if w.lower() not in stops]
        else:
            words = [w for w in words if w not in stops]

    if stem:
        # PorterStemmer.stem() works on a single word; calling it on the
        # whole text would treat the document as one long word.
        stemmer = PorterStemmer()
        words = [stemmer.stem(w) for w in words]

    return words
示例#3
0
 def GetDataFeatures(self, data, dictionary):
     """Build a presence feature map for ``dictionary`` words in ``data``.

     ``data`` is lower-cased and tokenized once with ``word_tokenize``.
     Each dictionary word is lower-cased and mapped to ``True`` when that
     lower-cased form is one of the tokens, ``False`` otherwise.

     Returns:
         dict mapping each lower-cased dictionary word to a bool.
     """
     # Tokenize once (not once per dictionary word) and use a set so each
     # membership test is O(1).
     tokens = set(word_tokenize(data.lower()))
     # Compare the lower-cased word: the tokens come from lower-cased text,
     # so a word containing upper-case letters could otherwise never match.
     return {word.lower(): word.lower() in tokens for word in dictionary}
示例#4
0
def _make_condition_button(row_box, label, matched, listbox):
    """Create one condition button inside ``row_box`` and return it.

    A matched condition gets a green button wired to ``pval`` with
    ``[label, listbox]`` as arguments (and a "█" prefix on non-Windows
    platforms); an unmatched condition gets a red button with no command.
    """
    pady = resolution[rt]['button_pady']

    if not matched:
        button = PushButton(row_box,
                            align="left",
                            width="10",
                            text=label,
                            pady=pady)
        button.bg = "#F84446"
        return button

    caption = label if platform == 'win32' else "█" + label
    button = PushButton(row_box,
                        align="left",
                        width="10",
                        text=caption,
                        command=pval,
                        pady=pady)
    button.update_command(pval, [label, listbox])
    button.bg = "#9EF844"
    return button


def analyze():
    """Match the condition terms against the entered text and rebuild the UI.

    Steps:
      1. Clear ``rdict`` / ``term_dict`` and destroy the buttons and list
         boxes created by a previous run.
      2. Load ``conditions.csv``: column 0 is the condition name, column 1 a
         comma-separated list of terms.
      3. Lower-case ``text_box.value``, optionally strip the history and
         physical-exam sections (when ``preprocess_checkbox.value == 0``)
         and split the text into sentences.
      4. For every sentence/condition pair, record each term found in the
         sentence in ``term_dict[(sentence, condition)]`` and the sentence
         itself (once) in ``rdict[condition]``.
      5. Show the sentences in a new ``ListBox`` and create one button per
         condition, four per row, green when matched and red otherwise.

    Reads and writes module-level state; returns nothing.
    """
    ldict = {}

    rdict.clear()
    term_dict.clear()

    # Tear down the widgets left over from the previous analysis.
    for btn in btn_list:
        btn.destroy()
    btn_list.clear()

    for lbox in lbox_list:
        lbox.destroy()
    lbox_list.clear()

    with open('conditions.csv', newline='') as f:
        for row in csv.reader(f):
            # Blank lines and rows without a terms column carry no
            # condition; skip them instead of failing with IndexError.
            if len(row) < 2:
                continue
            ldict[row[0]] = row[1].split(',')

    data_lower = text_box.value.lower()
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    if preprocess_checkbox.value == 0:
        # Each substitution feeds the next: drop the past-medical-history
        # section, then the physical-exam section in either spelling of the
        # heading that closes it.
        cleaned = re.sub('past medical history: diagnosis.*?drug use',
                         ' drug use ',
                         data_lower,
                         flags=re.DOTALL)
        cleaned = re.sub('physical exam.*?assessment/plan',
                         '. ASSESSMENT',
                         cleaned,
                         flags=re.DOTALL)
        cleaned = re.sub('physical exam.*?assessment and plan',
                         '. ASSESSMENT',
                         cleaned,
                         flags=re.DOTALL)
        sentence_list = tokenizer.tokenize(cleaned)
    else:
        sentence_list = tokenizer.tokenize(data_lower)

    # Resolve the search terms once per condition. The special "$date$"
    # marker stands for the dates in ``date_match``; expanding it here,
    # rather than extending the term list while iterating it for every
    # sentence, keeps the date terms from piling up sentence after sentence.
    search_terms = {}
    for key, terms in ldict.items():
        lowered = [term.lower() for term in terms]
        resolved = [term for term in lowered if term != "$date$"]
        if len(resolved) != len(lowered):
            resolved.extend(found.lower() for found in date_match)
        search_terms[key] = resolved

    for sentence in sentence_list:
        for key, terms in search_terms.items():
            pair = (sentence, key)
            for term in terms:
                if term not in sentence:
                    continue
                if pair in term_dict:
                    # A further term of the same condition in this sentence.
                    term_dict[pair].append(term)
                else:
                    # First hit for this sentence/condition pair: this is
                    # also the only time the sentence is added to rdict.
                    term_dict[pair] = [term]
                    if key in rdict:
                        rdict[key].append(sentence)
                    else:
                        rdict[key] = [sentence]

    listbox = ListBox(form_box,
                      items=sentence_list,
                      width="fill",
                      height=resolution[rt]['lbox_height'],
                      command=dispay_full,
                      multiselect=True,
                      scrollbar=True)
    lbox_list.append(listbox)
    listbox.bg = "#C8D7E9"

    # Buttons are laid out four per row across the eight button rows.
    button_rows = (button_box_r1, button_box_r2, button_box_r3,
                   button_box_r4, button_box_r5, button_box_r6,
                   button_box_r7, button_box_r8)
    buttons_per_row = 4

    for counter, k in enumerate(ldict):
        row_index = counter // buttons_per_row
        if row_index >= len(button_rows):
            # No row left for further conditions. Stop instead of
            # registering the previous button again, which would make the
            # next run destroy the same widget more than once.
            break
        btn = _make_condition_button(button_rows[row_index], k, k in rdict,
                                     listbox)
        btn_list.append(btn)