# Scratch calls from earlier experiments, left disabled.
# rnn.compareSentences("There is a intracranial haemorrhage","There is a haemorrhage in the cranium")
# rnn.compareSentences("There is no intracranial haemorrhage","There is a haemorrhage in the cranium")
# rnn.compareSentences("There is a intracranial haemorrhage","The study is within normal limits")
# rnn.compareSentences("There is a intracranial haemorrhage.","There is a haemorrhage in the cranium.")
# rnn.compareSentences("There is no intracranial haemorrhage.","There is a haemorrhage in the cranium.")
# rnn.compareSentences("There is a intracranial haemorrhage.","The study is within normal limits.")
# rnn.nextWords("VENTRICULAR CALIBRE IS WITHIN NORMAL LIMITS FOR AGE AND IT IS")
# rnn.nextWords("VENTRICULAR CALIBRE IS WITHIN NORMAL LIMITS FOR AGE")
# rnn.nextWords("NO INTRACEREBRAL HAEMATOMA OR")
# rnn.nextWords("left sided embolus")
# rnn.reportsToDense()
# rnn.buildReportRNN(epochs=180)
# rnn.buildReportRNN(epochs=20,continueTraining=True)
# rnn.reportToEncoder()
# rnn.reports2vecs()
# generateReports.labelClassificationRNN()
# generateReports.labelClassificationRNN(learn=False)

# Fetch the reports, echo the two at indices 300 and 3000, then print the
# result of comparing them with rnn.compareReportSentences.
print("loading reports")
reports = preprocess.getReports()
print("loaded reports")

print("report 1:")
print(reports[300])

print("report 2:")
print(reports[3000])

print(rnn.compareReportSentences(reports[300], reports[3000]))
def runReportSimilarity(fileName, threshold=0.9, reportType="lsi"): """ Assumes reports have FINDINGS: or REPORT: """ fileText = [row.rstrip("\n") for row in open(fileName)] wordsToFind = ["FINDINGS:", "REPORT:"] report1 = fileText[0] report2 = fileText[1] startLoc1 = -1 # startLoc2 = -1 for word in wordsToFind: if startLoc1 == -1 and report1.find(word) != -1: startLoc1 = report1.find(word) + len(word) # if startLoc2 == -1 and report2.find(word) != -1: # startLoc2 = report2.find(word)+len(word) sCom = [] report1 = report1[startLoc1:] sentences1 = rnn.splitIntoSentences(report1) sentences2 = rnn.splitIntoSentences(report2) # sentences1 = rnn.textPreprocess(report1) # sentences2 = rnn.textPreprocess(report2) # sentences1 = report1.split('.') # sentences2 = report2.split('.') sent1 = sentences1[:] sent2 = sentences2[:] if reportType == "lsi": # report2 = report2[startLoc2:] report1 = search_engine.textPreprocess(report1) report1 = search_engine.getDerivations(report1) report2 = search_engine.textPreprocess(report2) report2 = search_engine.getDerivations(report2) for i in range(len(sentences1)): sentences1[i] = search_engine.textPreprocess(sentences1[i]) sentences1[i] = search_engine.getDerivations(sentences1[i]) for i in range(len(sentences2)): sentences2[i] = search_engine.textPreprocess(sentences2[i]) sentences2[i] = search_engine.getDerivations(sentences2[i]) # corpus = gensim.corpora.MmCorpus('./model_files/reports_lsi.mm') tfidf_model = gensim.models.TfidfModel.load("./model_files/reports.tfidf_model") lsi_model = gensim.models.LsiModel.load("./model_files/reports.lsi_model") dictionary = gensim.corpora.Dictionary.load("./model_files/reports.dict") vec_lsi1 = lsi_model[tfidf_model[dictionary.doc2bow(report1)]] vec_lsi2 = lsi_model[tfidf_model[dictionary.doc2bow(report2)]] sen1Corp = [dictionary.doc2bow(sent) for sent in sentences1] sen2Corp = [dictionary.doc2bow(sent) for sent in sentences2] vec_lsis1 = lsi_model[tfidf_model[sen1Corp]] vec_lsis2 = 
lsi_model[tfidf_model[sen2Corp]] # print corpus.num_terms # ind = gensim.similarities.MatrixSimilarity(vec_lsis1,num_features=corpus.num_terms) ind = gensim.similarities.MatrixSimilarity(vec_lsis1, num_features=10) # similarity table for i in vec_lsis2: sCom.append(ind[i]) elif reportType == "rnn": sCom = rnn.compareReportSentences(report1, report2) sCom2 = [] for i in range(len(sCom[0])): row = [] for j in range(len(sCom)): row.append(sCom[j][i]) sCom2.append(row) sCom = sCom2 missing = [0 for s in sent1] # obtain correct sentence i = 0 output = {"missing": 0, "corrections": 0, "extras": 0, "correct": 0} for col in sCom: # for col in range(len(sCom[0])) # for col in sent2: aboveTopThreshold = False j = 0 bestSim = 0 for sim in col: if sim > threshold: aboveTopThreshold = True if sim > bestSim: bestSim = sim if missing[j] < sim: missing[j] = sim j += 1 if aboveTopThreshold: # maybe add percentage for debugging # sent2[i] = " ".join([k for k in sent2[i]]) s = "n\t" + sent2[i] + "\t" output["correct"] += 1 print s else: # sent2[i] = " ".join([k for k in sent2[i]]) s = "e\t" + sent2[i] + "\t" output["extras"] += 1 print s i += 1 i = 0 for k in missing: if k <= threshold: # sent1[i] = " ".join([k for k in sent1[i]]) # s = str(k) s = "m\t" + sent1[i] + "\t" output["missing"] += 1 print s i += 1 return output """