def searchEngineTest(model, searchTerm):
    """Run a search against the model and print the best-matching reports.

    Prints the report index, similarity score and full text for each of the
    5 most similar reports, or an error message if the term yields nothing.
    """
    print("Search: " + searchTerm)
    reports = preprocess.getReports()
    similarReports = search(model, 5, searchTerm)
    if not similarReports:
        # nothing matched — report the problem and bail out early instead of
        # falling through to an empty loop
        print("ERROR: Invalid search term")
        return
    for reportIdx in similarReports:
        print("----------")
        # each entry is a (report index, similarity score) pair
        print("Report #: " + str(reportIdx[0]) + " Similarity: " +
              str(reportIdx[1]))
        print(reports[reportIdx[0]])
def preprocessReports(fileNames=REPORT_FILES):
    """Preprocess every report in ``fileNames`` and pickle the results.

    Each report is run through textPreprocess(); the processed reports are
    pickled to ./model_files/reports_full and the accumulated sentences to
    ./model_files/reports_sentences_full.  Progress is printed every 100
    reports.
    """
    allReports = []
    allSentences = []
    for j in range(len(fileNames)):
        reports = preprocess.getReports([fileNames[j]])
        print("loading finished")
        for i in range(len(reports)):
            reports[i] = textPreprocess(reports[i])
            # textPreprocess returns a list of sentences per report
            allSentences = allSentences + reports[i]
            if (i % 100 == 0):
                # BUG FIX: the old form `i / len(reports) * 100` used integer
                # division under Python 2 and always printed 0; multiply
                # first so the percentage is meaningful.
                print(i * 100 // len(reports))
        print("preprocessing finished")
        allReports = allReports + reports
    # pickle in binary mode; `with` guarantees the handles are closed and
    # avoids shadowing the `file` builtin
    with open('./model_files/reports_full', 'wb') as outFile:
        pickle.dump(allReports, outFile)
    print("reports saved")
    with open('./model_files/reports_sentences_full', 'wb') as outFile:
        pickle.dump(allSentences, outFile)
    print("sentences saved")
def labelClassificationRNN(learn=True):
    """Train/evaluate linear SVMs on RNN report vectors, per diagnosis.

    When ``learn`` is True a small grid of C values is tried and the best C
    per class (chosen by cross-validation AUC) is pickled to
    ./model_files/svm_c_values.pkl; when False the previously pickled C
    values are loaded and used as-is.  Per-fold predictions go to a
    timestamped CSV and a ROC figure is saved per diagnosis.
    """
    if learn:
        # NOTE(review): this first assignment is dead — it is immediately
        # replaced by the 0.005 row below, so C=0.001 is never evaluated.
        c_vals = [[0.001, 0.001, 0.001, 0.001]]
        # one candidate C per labelled class (indexed by j below)
        c_vals = [[0.005, 0.005, 0.005, 0.005]]
        c_vals.append([0.01, 0.01, 0.01, 0.01])
        c_vals.append([0.05, 0.05, 0.05, 0.05])
        c_vals.append([0.1, 0.1, 0.1, 0.1])
        c_vals.append([0.5, 0.5, 0.5, 0.5])
        c_vals.append([1, 1, 1, 1])
        optimal_c = [[0, 0, 0, 0]]  # best C found so far, per class
    else:
        # reuse the C values chosen by a previous learn=True run
        file = open('./model_files/svm_c_values.pkl', 'r')
        c_vals = pickle.load(file)
        optimal_c = c_vals
        file.close()
    reports = preprocess.getReports()
    reportVectors = rnn.loadReportVecs()
    numFolds = 5  # number of folds for cross validation
    # timestamped output directory for the CSV and ROC figures
    directory = "label_classification/" + datetime.datetime.now().strftime(
        '%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])
        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])
            # fetch corpus and labels
            labelledReports = []
            labelledCorpus = list()
            # The labeled data is at the start of the data set
            # Get the ids in the corpus of these first labeled examples for each class
            for i in range(
                    preprocess.getNumReports(REPORT_FILES[:j]),
                    preprocess.getNumReports(REPORT_FILES[:j]) +
                    preprocess.getNumReports([REPORT_FILES_LABELLED[j]])):
                labelledReports.append(reports[i])
                labelledCorpus.append(reportVectors[i][:])
            # third column of the labelled data file holds the class label
            labels = np.asarray(preprocess.getData([REPORT_FILES_LABELLED[j]
                                                    ]))[:, 2]
            # list form used later to recover a report's index from its vector
            corpusList = [list(x) for x in labelledCorpus]
            ############### THIS CODE BLOCK REMOVES THE NUMBER OF NEGATIVE LABELS TO EQUALISE THE DISTRIBUTION OF CLASS LABELS. TO BE REMOVED IN FUTURE.
            # count = 0
            # deletes = []
            # for x in range(len(labels)):
            #     if (labels[x] == "negative"):
            #         count = count + 1
            #         deletes.append(x)
            #         if (count == (len(labels)-(list(labels).count("positive"))*2)):
            #             break
            # labelledCorpus = np.delete(labelledCorpus,deletes,axis=0)
            # labels = np.delete(labels,deletes)
            ##################
            best_area_cv = -1
            for c_value in c_vals:
                for n in range(numFolds):
                    # split training and test data
                    train_labelledCorpus, test_labelledCorpus, train_labels, test_labels = train_test_split(
                        labelledCorpus, labels, test_size=0.15)
                    # Split of the last 20% of training set for cross validation
                    cv_labelledCorpus = train_labelledCorpus[
                        int(0.8 * len(train_labelledCorpus)):]
                    train_labelledCorpus = train_labelledCorpus[:int(
                        0.8 * len(train_labelledCorpus))]
                    cv_labels = train_labels[int(0.8 * len(train_labels)):]
                    train_labels = train_labels[:int(0.8 * len(train_labels))]
                    # build classifier
                    classifier = svm.SVC(C=c_value[j], kernel='linear').fit(
                        train_labelledCorpus, train_labels)
                    # compute output label and corresponding score
                    output_test = classifier.predict(test_labelledCorpus)
                    output_cv = classifier.predict(cv_labelledCorpus)
                    output_train = classifier.predict(train_labelledCorpus)
                    output_scores_test = classifier.decision_function(
                        test_labelledCorpus)
                    output_scores_train = classifier.decision_function(
                        train_labelledCorpus)
                    output_scores_cv = classifier.decision_function(
                        cv_labelledCorpus)
                    # accumulate labels/scores across folds for pooled ROC curves
                    if n == 0:
                        all_test_labels = tuple(test_labels)
                        all_output_scores_test = tuple(output_scores_test)
                        all_cv_labels = tuple(cv_labels)
                        all_output_scores_cv = tuple(output_scores_cv)
                        all_train_labels = tuple(train_labels)
                        all_output_scores_train = tuple(output_scores_train)
                    else:
                        all_test_labels = all_test_labels + tuple(test_labels)
                        all_output_scores_test = all_output_scores_test + tuple(
                            output_scores_test)
                        all_cv_labels = all_cv_labels + tuple(cv_labels)
                        all_output_scores_cv = all_output_scores_cv + tuple(
                            output_scores_cv)
                        all_train_labels = all_train_labels + tuple(
                            train_labels)
                        all_output_scores_train = all_output_scores_train + tuple(
                            output_scores_train)
                    # save result for fold to file
                    for r in range(len(test_labels)):
                        reportIdx = corpusList.index(
                            list(test_labelledCorpus[r]))
                        # NOTE(review): passing a plain string to writerow()
                        # emits one character per CSV column — this probably
                        # meant writer.writerow(["With c value: " + ...]).
                        writer.writerow("With c value: " + str(c_value[j]))
                        writer.writerow([
                            output_scores_test[r], output_test[r],
                            test_labels[r]
                        ])
                        writer.writerow([labelledReports[reportIdx]])
                # generate the roc curve (pooled over all folds for this C)
                fp_test, tp_test, _ = roc_curve(all_test_labels,
                                                all_output_scores_test,
                                                pos_label="positive")
                fp_cv, tp_cv, _ = roc_curve(all_cv_labels,
                                            all_output_scores_cv,
                                            pos_label="positive")
                fp_train, tp_train, _ = roc_curve(all_train_labels,
                                                  all_output_scores_train,
                                                  pos_label="positive")
                # Calculate the area under the curves
                area_test = auc(fp_test, tp_test)
                area_cv = auc(fp_cv, tp_cv)
                area_train = auc(fp_train, tp_train)
                # Store c value, tps, fps and aucs if cv auc is new best
                if area_cv > best_area_cv:
                    optimal_c[0][j] = c_value[j]
                    best_fp_test = fp_test
                    best_tp_test = tp_test
                    best_fp_cv = fp_cv
                    best_tp_cv = tp_cv
                    best_fp_train = fp_train
                    best_tp_train = tp_train
                    best_area_test = area_test
                    best_area_cv = area_cv
                    best_area_train = area_train
            # initialise and plot the average ROC curves for optimal c value
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC: c value of " +
                      str(optimal_c[0][j]))
            plt.plot(best_fp_test,
                     best_tp_test,
                     'b',
                     label='test(area = %0.2f)' % best_area_test)
            plt.plot(best_fp_cv,
                     best_tp_cv,
                     'g',
                     label='cv(area = %0.2f)' % best_area_cv)
            plt.plot(best_fp_train,
                     best_tp_train,
                     'r',
                     label='train(area = %0.2f)' % best_area_train)
            plt.legend(loc='lower right')
            plt.savefig(directory + name)
        writeFile.close()
    if learn:
        # persist the per-class optimal C values for later learn=False runs
        file = open('./model_files/svm_c_values.pkl', 'w')
        pickle.dump(optimal_c, file)
        file.close()
def labelClassificationD2V():
    """Classify labelled reports with a linear SVM over Doc2Vec vectors.

    For each labelled diagnosis file: infers a Doc2Vec vector per report,
    balances the class distribution by dropping excess negatives, trains a
    linear SVM over several random splits, writes per-report scores to a
    timestamped CSV and saves per-fold ROC curves to a PNG figure.
    """
    model = gensim.models.Doc2Vec.load("./model_files/reports.doc2vec_model")
    reports = preprocess.getReports()
    processedReports = preprocess.getProcessedReports()
    numFolds = 5  # number of folds for cross validation
    # timestamped output directory for the CSV and ROC figures
    directory = "label_classification/" + datetime.datetime.now().strftime(
        '%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])
        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])
            # initialise figure and plot
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC")
            # fetch corpus and labels
            labelledReports = []
            labelledCorpus = list()
            # The labeled data is at the start of the data set
            # Get the ids in the corpus of these first labeled examples for each class
            for i in range(
                    preprocess.getNumReports(REPORT_FILES[:j]),
                    preprocess.getNumReports(REPORT_FILES[:j]) +
                    preprocess.getNumReports([REPORT_FILES_LABELLED[j]])):
                labelledReports.append(reports[i])
                labelledCorpus.append(model.infer_vector(processedReports[i]))
            labels = np.asarray(preprocess.getData([REPORT_FILES_LABELLED[j]
                                                    ]))[:, 2]
            # list form used later to recover a report's index from its vector
            corpusList = [list(x) for x in labelledCorpus]
            ############### THIS CODE BLOCK REMOVES THE NUMBER OF NEGATIVE LABELS TO EQUALISE THE DISTRIBUTION OF CLASS LABELS. TO BE REMOVED IN FUTURE.
            count = 0
            deletes = []
            for x in range(len(labels)):
                if (labels[x] == "negative"):
                    count = count + 1
                    deletes.append(x)
                    if (count == (len(labels) -
                                  (list(labels).count("positive")) * 2)):
                        break
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)
            ##################
            numData = len(labels)  # size of the labelled data set
            dataPerFold = int(math.ceil(numData / numFolds))
            for n in range(0, numFolds):
                # split training and test data
                train_labelledCorpus, test_labelledCorpus, train_labels, test_labels = train_test_split(
                    labelledCorpus, labels, test_size=0.13)
                # build classifier
                classifier = svm.SVC(kernel='linear').fit(
                    train_labelledCorpus, train_labels)
                # compute output label and corresponding score
                output_test = classifier.predict(test_labelledCorpus)
                output_train = classifier.predict(train_labelledCorpus)
                output_scores_test = classifier.decision_function(
                    test_labelledCorpus)
                output_scores_train = classifier.decision_function(
                    train_labelledCorpus)
                # sort scores and labels in order (keeps rows aligned)
                sortList = list(
                    zip(output_scores_test, output_test, test_labels,
                        test_labelledCorpus))
                sortList.sort()
                output_scores_test, output_test, test_labels, test_labelledCorpus = zip(
                    *sortList)
                # build roc curve and plot
                fp_test, tp_test, _ = roc_curve(test_labels,
                                                output_scores_test,
                                                pos_label="positive")
                fp_train, tp_train, _ = roc_curve(train_labels,
                                                  output_scores_train,
                                                  pos_label="positive")
                # BUG FIX: the legend labels were swapped — the red test
                # curve was labelled "train" and the blue train curve "test".
                plt.plot(fp_test, tp_test, 'r',
                         label="test" if n == 0 else "")
                plt.plot(fp_train, tp_train, 'b',
                         label="train" if n == 0 else "")
                plt.legend(loc='lower right')
                plt.savefig(directory + name)
                # save result to file
                for r in range(len(test_labels)):
                    reportIdx = corpusList.index(list(test_labelledCorpus[r]))
                    writer.writerow("")
                    writer.writerow([
                        output_scores_test[r], output_test[r], test_labels[r]
                    ])
                    writer.writerow([labelledReports[reportIdx]])
        # plt.show()
        writeFile.close()
def labelClassification():
    """Classify labelled reports with a linear SVM over LSI corpus vectors.

    For each labelled diagnosis file: balances the class distribution,
    trains a linear SVM over several random splits, pools the fold results,
    writes per-report scores to a timestamped CSV and saves the pooled ROC
    curves (with AUC) to a PNG figure.
    """
    corpus = gensim.corpora.MmCorpus('./model_files/reports_lsi.mm')
    # convert the corpus to a numpy matrix, take the transpose and convert it to a list
    corpusList = [
        list(x) for x in zip(*gensim.matutils.corpus2dense(
            corpus, corpus.num_terms, dtype=np.float64))
    ]
    # corpusList = [list(x) for x in np.asarray(corpus)[:,:,1]]
    reports = preprocess.getReports()
    numFolds = 5  # number of folds for cross validation
    # Create the output directory (timestamped)
    directory = "label_classification/" + datetime.datetime.now().strftime(
        '%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])
        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])
            # initialise figure and plot
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC")
            # fetch corpus and labels
            labelledCorpus = []
            # print(range(getNumReports(REPORT_FILES[:j]),getNumReports(REPORT_FILES[:j])+getNumReports([REPORT_FILES_LABELLED[j]])))
            # The labeled data is at the start of the data set
            # Get the ids in the corpus of these first labeled examples for each class
            for i in range(
                    preprocess.getNumReports(REPORT_FILES[:j]),
                    preprocess.getNumReports(REPORT_FILES[:j]) +
                    preprocess.getNumReports([REPORT_FILES_LABELLED[j]])):
                labelledCorpus.append((corpusList[i]))
            # third column of the labelled data file holds the class label
            labels = np.asarray(preprocess.getData([REPORT_FILES_LABELLED[j]
                                                    ]))[:, 2]
            ############### THIS CODE BLOCK REMOVES THE NUMBER OF NEGATIVE LABELS TO EQUALISE THE DISTRIBUTION OF CLASS LABELS. TO BE REMOVED IN FUTURE.
            count = 0
            deletes = []
            for x in range(len(labels)):
                if (labels[x] == "negative"):
                    count = count + 1
                    deletes.append(x)
                    if (count == (len(labels) -
                                  (list(labels).count("positive")) * 2)):
                        break
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)
            ##################
            numData = len(labels)  # size of the labelled data set
            dataPerFold = int(math.ceil(numData / numFolds))
            for n in range(0, numFolds):
                # split training and test data
                train_labelledCorpus, test_labelledCorpus, train_labels, test_labels = train_test_split(
                    labelledCorpus, labels, test_size=0.13)
                # build classifier
                classifier = svm.SVC(kernel='linear').fit(
                    train_labelledCorpus, train_labels)
                # classifier = svm.LinearSVC(C=1.0).fit(train_labelledCorpus,train_labels)
                # classifier = neighbors.KNeighborsClassifier(n_neighbors=3).fit(train_labelledCorpus,train_labels)
                # compute output label and corresponding score
                output_test = classifier.predict(test_labelledCorpus)
                output_train = classifier.predict(train_labelledCorpus)
                output_scores_test = classifier.decision_function(
                    test_labelledCorpus)
                output_scores_train = classifier.decision_function(
                    train_labelledCorpus)
                # sort scores and labels in order (keeps rows aligned)
                sortList = list(
                    zip(output_scores_test, output_test, test_labels,
                        test_labelledCorpus))
                sortList.sort()
                output_scores_test, output_test, test_labels, test_labelledCorpus = zip(
                    *sortList)
                # accumulate labels/scores across folds for pooled ROC curves
                if n == 0:
                    all_test_labels = test_labels
                    all_output_scores_test = output_scores_test
                    all_train_labels = tuple(train_labels)
                    all_output_scores_train = tuple(output_scores_train)
                else:
                    all_test_labels = all_test_labels + test_labels
                    all_output_scores_test = all_output_scores_test + output_scores_test
                    all_train_labels = all_train_labels + tuple(train_labels)
                    all_output_scores_train = all_output_scores_train + tuple(
                        output_scores_train)
                # save result to file
                for r in range(len(test_labels)):
                    reportIdx = corpusList.index(list(test_labelledCorpus[r]))
                    writer.writerow("")
                    writer.writerow([
                        output_scores_test[r], output_test[r], test_labels[r]
                    ])
                    writer.writerow([reports[reportIdx]])
            # generate the roc curve (pooled over all folds)
            fp_test, tp_test, _ = roc_curve(all_test_labels,
                                            all_output_scores_test,
                                            pos_label="positive")
            fp_train, tp_train, _ = roc_curve(all_train_labels,
                                              all_output_scores_train,
                                              pos_label="positive")
            # Calculate the area under the curves
            area_test = auc(fp_test, tp_test)
            area_train = auc(fp_train, tp_train)
            # Plot the average ROC curves
            plt.plot(fp_test,
                     tp_test,
                     'b',
                     label='test(area = %0.2f)' % area_test)
            plt.plot(fp_train,
                     tp_train,
                     'r',
                     label='train(area = %0.2f)' % area_train)
            plt.legend(loc='lower right')
            plt.savefig(directory + name)
        writeFile.close()
def testClassification(threshold, fileType):
    """Score unlabelled reports of one type and dump near-threshold cases.

    Trains a linear SVM on the labelled reports for ``fileType``, scores the
    unlabelled remainder, and writes every report whose absolute decision
    score is below ``threshold`` to labelClassification.csv.  The trained
    model coefficients are printed and saved to coef.csv.
    """
    REPORT_FILES = [('Cleaned' + fileType + 'Full.csv')]
    REPORT_FILES_LABELLED = [('Cleaned' + fileType + 'Labelled.csv')]
    DIAGNOSES = [fileType]
    corpus = gensim.corpora.MmCorpus('../model_files/reports_lsi.mm')
    # convert the corpus to a numpy matrix, take the transpose and convert it to a list
    corpusList = [
        list(x) for x in zip(*gensim.matutils.corpus2dense(
            corpus, corpus.num_terms, dtype=np.float64))
    ]
    # corpusList = [list(x) for x in np.asarray(corpus)[:,:,1]]
    reports = preprocess.getReports(fileType)
    numFolds = 5  # number of folds for cross validation
    # BUG FIX: `parameters` was appended to below without ever being
    # initialised, which raised a NameError on the first use.
    parameters = []
    with open("labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])
        for j in range(len(REPORT_FILES_LABELLED)):
            # writer.writerow("")
            # writer.writerow("")
            writer.writerow([DIAGNOSES[j], "", ""])  # Added "" for csv parsing
            # fetch corpus and labels
            labelledCorpus = []
            unlabelledCorpus = []
            # The labeled data is at the start of the data set
            # Get the ids in the corpus of these first labeled examples for each class
            for i in range(
                    preprocess.getNumReports(REPORT_FILES[:j]),
                    preprocess.getNumReports(REPORT_FILES[:j]) +
                    preprocess.getNumReports([REPORT_FILES_LABELLED[j]])):
                labelledCorpus.append(corpusList[i])
            for i in range(
                    preprocess.getNumReports(REPORT_FILES[:j]) +
                    preprocess.getNumReports([REPORT_FILES_LABELLED[j]]),
                    preprocess.getNumReports(REPORT_FILES[:j]) +
                    preprocess.getNumReports([REPORT_FILES[j]])):
                unlabelledCorpus.append(corpusList[i])
            labels = np.asarray(preprocess.getData([REPORT_FILES_LABELLED[j]
                                                    ]))[:, 2]
            ############### THIS CODE BLOCK REMOVES THE NUMBER OF NEGATIVE LABELS TO EQUALISE THE DISTRIBUTION OF CLASS LABELS. TO BE REMOVED IN FUTURE.
            count = 0
            deletes = []
            for x in range(len(labels)):
                if (labels[x] == "negative"):
                    count = count + 1
                    deletes.append(x)
                    if (count == (len(labels) -
                                  (list(labels).count("positive")) * 2)):
                        break
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)
            ##################
            numData = len(labels)  # size of the labelled data set
            # build classifier
            classifier = svm.SVC(kernel='linear').fit(labelledCorpus, labels)
            # print() form keeps this consistent with the rest of the file
            # (these were Python 2 print statements)
            print("")
            print("Model parameters:")
            print(classifier.coef_)
            print("")
            print("L2 norm of current model: " +
                  str(np.linalg.norm(classifier.coef_)))
            print("")
            for i in range(len(classifier.coef_)):
                parameters.append(classifier.coef_[i])
            # compute output label and corresponding score
            output_test = classifier.predict(unlabelledCorpus)
            output_scores_test = classifier.decision_function(
                unlabelledCorpus)
            # sort scores and labels in order (keeps rows aligned)
            sortList = list(
                zip(output_scores_test, output_test, unlabelledCorpus))
            sortList.sort()
            output_scores_test, output_test, unlabelledCorpus = zip(*sortList)
            # save result to file
            for r in range(len(unlabelledCorpus)):
                if (abs(output_scores_test[r]) < threshold):
                    reportIdx = corpusList.index(list(unlabelledCorpus[r]))
                    # writer.writerow("")  # Removing newline to help with future parsing
                    writer.writerow(
                        [reportIdx, output_scores_test[r], output_test[r]])
                    writer.writerow([reports[reportIdx], "", ""
                                     ])  # Added extra "" to make csv parsing work
        writeFile.close()
    # Write model parameters to file
    with open("coef.csv", 'w') as fout:
        writer = csv.writer(fout)
        for i in range(len(parameters)):
            writer.writerow(parameters[i])
    print("Model parameters saved to file.")
def testClassification():
    """Score all unlabelled reports and dump cases near the decision boundary.

    Trains a linear SVM per labelled diagnosis file, scores the unlabelled
    remainder of the corpus, and writes reports whose absolute decision
    score is below a small threshold to a timestamped CSV.
    """
    threshold = 0.001  # typo fix: was "threashold"
    corpus = gensim.corpora.MmCorpus('./model_files/reports_lsi.mm')
    # convert the corpus to a numpy matrix, take the transpose and convert it to a list
    corpusList = [
        list(x) for x in zip(*gensim.matutils.corpus2dense(
            corpus, corpus.num_terms, dtype=np.float64))
    ]
    # corpusList = [list(x) for x in np.asarray(corpus)[:,:,1]]
    reports = preprocess.getReports()
    numFolds = 5  # number of folds for cross validation
    # Create the output directory (timestamped); guard against an existing
    # directory like the sibling helpers do — bare makedirs raises if the
    # path already exists (e.g. two runs in the same minute)
    directory = "label_tests/" + datetime.datetime.now().strftime(
        '%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])
        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])
            # fetch corpus and labels
            labelledCorpus = []
            unlabelledCorpus = []
            # The labeled data is at the start of the data set
            # Get the ids in the corpus of these first labeled examples for each class
            for i in range(
                    preprocess.getNumReports(REPORT_FILES[:j]),
                    preprocess.getNumReports(REPORT_FILES[:j]) +
                    preprocess.getNumReports([REPORT_FILES_LABELLED[j]])):
                labelledCorpus.append(corpusList[i])
            for i in range(
                    preprocess.getNumReports(REPORT_FILES[:j]) +
                    preprocess.getNumReports([REPORT_FILES_LABELLED[j]]),
                    preprocess.getNumReports(REPORT_FILES[:j]) +
                    preprocess.getNumReports([REPORT_FILES[j]])):
                unlabelledCorpus.append(corpusList[i])
            labels = np.asarray(preprocess.getData([REPORT_FILES_LABELLED[j]
                                                    ]))[:, 2]
            ############### THIS CODE BLOCK REMOVES THE NUMBER OF NEGATIVE LABELS TO EQUALISE THE DISTRIBUTION OF CLASS LABELS. TO BE REMOVED IN FUTURE.
            count = 0
            deletes = []
            for x in range(len(labels)):
                if (labels[x] == "negative"):
                    count = count + 1
                    deletes.append(x)
                    if (count == (len(labels) -
                                  (list(labels).count("positive")) * 2)):
                        break
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)
            ##################
            numData = len(labels)  # size of the labelled data set
            # build classifier
            classifier = svm.SVC(kernel='linear').fit(labelledCorpus, labels)
            # compute output label and corresponding score
            output_test = classifier.predict(unlabelledCorpus)
            output_scores_test = classifier.decision_function(
                unlabelledCorpus)
            # sort scores and labels in order (keeps rows aligned)
            sortList = list(
                zip(output_scores_test, output_test, unlabelledCorpus))
            sortList.sort()
            output_scores_test, output_test, unlabelledCorpus = zip(*sortList)
            # save result to file
            for r in range(len(unlabelledCorpus)):
                if (abs(output_scores_test[r]) < threshold):
                    reportIdx = corpusList.index(list(unlabelledCorpus[r]))
                    writer.writerow("")
                    writer.writerow(
                        [reportIdx, output_scores_test[r], output_test[r]])
                    writer.writerow([reports[reportIdx]])
        writeFile.close()
# rnn.compareSentences("There is a intracranial haemorrhage","There is a haemorrhage in the cranium") # rnn.compareSentences("There is no intracranial haemorrhage","There is a haemorrhage in the cranium") # rnn.compareSentences("There is a intracranial haemorrhage","The study is within normal limits") # rnn.compareSentences("There is a intracranial haemorrhage.","There is a haemorrhage in the cranium.") # rnn.compareSentences("There is no intracranial haemorrhage.","There is a haemorrhage in the cranium.") # rnn.compareSentences("There is a intracranial haemorrhage.","The study is within normal limits.") # rnn.nextWords("VENTRICULAR CALIBRE IS WITHIN NORMAL LIMITS FOR AGE AND IT IS") # rnn.nextWords("VENTRICULAR CALIBRE IS WITHIN NORMAL LIMITS FOR AGE") # rnn.nextWords("NO INTRACEREBRAL HAEMATOMA OR") # rnn.nextWords("left sided embolus") # rnn.reportsToDense() # rnn.buildReportRNN(epochs=180) # rnn.buildReportRNN(epochs=20,continueTraining=True) # rnn.reportToEncoder() # rnn.reports2vecs() # generateReports.labelClassificationRNN() # generateReports.labelClassificationRNN(learn=False) print("loading reports") reports = preprocess.getReports() print("loaded reports") print("report 1:") print(reports[300]) print("report 2:") print(reports[3000]) print(rnn.compareReportSentences(reports[300],reports[3000]))
def labelClassificationRNN(learn=True):
    """Train/evaluate linear SVMs on RNN report vectors, per diagnosis.

    When ``learn`` is True a small grid of C values is tried and the best C
    per class (chosen by cross-validation AUC) is pickled to
    ./model_files/svm_c_values.pkl; when False the previously pickled C
    values are loaded and used as-is.  Per-fold predictions go to a
    timestamped CSV and a ROC figure is saved per diagnosis.
    """
    if learn:
        # NOTE(review): this first assignment is dead — it is immediately
        # replaced by the 0.005 row below, so C=0.001 is never evaluated.
        c_vals = [[0.001, 0.001, 0.001, 0.001]]
        # one candidate C per labelled class (indexed by j below)
        c_vals = [[0.005, 0.005, 0.005, 0.005]]
        c_vals.append([0.01, 0.01, 0.01, 0.01])
        c_vals.append([0.05, 0.05, 0.05, 0.05])
        c_vals.append([0.1, 0.1, 0.1, 0.1])
        c_vals.append([0.5, 0.5, 0.5, 0.5])
        c_vals.append([1, 1, 1, 1])
        optimal_c = [[0, 0, 0, 0]]  # best C found so far, per class
    else:
        # reuse the C values chosen by a previous learn=True run
        file = open('./model_files/svm_c_values.pkl', 'r')
        c_vals = pickle.load(file)
        optimal_c = c_vals
        file.close()
    reports = preprocess.getReports()
    reportVectors = rnn.loadReportVecs()
    numFolds = 5  # number of folds for cross validation
    # timestamped output directory for the CSV and ROC figures
    directory = "label_classification/" + datetime.datetime.now().strftime(
        '%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])
        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])
            # fetch corpus and labels
            labelledReports = []
            labelledCorpus = list()
            # The labeled data is at the start of the data set
            # Get the ids in the corpus of these first labeled examples for each class
            for i in range(
                    preprocess.getNumReports(REPORT_FILES[:j]),
                    preprocess.getNumReports(REPORT_FILES[:j]) +
                    preprocess.getNumReports([REPORT_FILES_LABELLED[j]])):
                labelledReports.append(reports[i])
                labelledCorpus.append(reportVectors[i][:])
            # third column of the labelled data file holds the class label
            labels = np.asarray(preprocess.getData([REPORT_FILES_LABELLED[j]
                                                    ]))[:, 2]
            # list form used later to recover a report's index from its vector
            corpusList = [list(x) for x in labelledCorpus]
            ############### THIS CODE BLOCK REMOVES THE NUMBER OF NEGATIVE LABELS TO EQUALISE THE DISTRIBUTION OF CLASS LABELS. TO BE REMOVED IN FUTURE.
            # count = 0
            # deletes = []
            # for x in range(len(labels)):
            #     if (labels[x] == "negative"):
            #         count = count + 1
            #         deletes.append(x)
            #         if (count == (len(labels)-(list(labels).count("positive"))*2)):
            #             break
            # labelledCorpus = np.delete(labelledCorpus,deletes,axis=0)
            # labels = np.delete(labels,deletes)
            ##################
            best_area_cv = -1
            for c_value in c_vals:
                for n in range(numFolds):
                    # split training and test data
                    train_labelledCorpus, test_labelledCorpus, train_labels, test_labels = train_test_split(
                        labelledCorpus, labels, test_size=0.15)
                    # Split of the last 20% of training set for cross validation
                    cv_labelledCorpus = train_labelledCorpus[
                        int(0.8 * len(train_labelledCorpus)):]
                    train_labelledCorpus = train_labelledCorpus[:int(
                        0.8 * len(train_labelledCorpus))]
                    cv_labels = train_labels[int(0.8 * len(train_labels)):]
                    train_labels = train_labels[:int(0.8 * len(train_labels))]
                    # build classifier
                    classifier = svm.SVC(C=c_value[j], kernel='linear').fit(
                        train_labelledCorpus, train_labels)
                    # compute output label and corresponding score
                    output_test = classifier.predict(test_labelledCorpus)
                    output_cv = classifier.predict(cv_labelledCorpus)
                    output_train = classifier.predict(train_labelledCorpus)
                    output_scores_test = classifier.decision_function(
                        test_labelledCorpus)
                    output_scores_train = classifier.decision_function(
                        train_labelledCorpus)
                    output_scores_cv = classifier.decision_function(
                        cv_labelledCorpus)
                    # accumulate labels/scores across folds for pooled ROC curves
                    if n == 0:
                        all_test_labels = tuple(test_labels)
                        all_output_scores_test = tuple(output_scores_test)
                        all_cv_labels = tuple(cv_labels)
                        all_output_scores_cv = tuple(output_scores_cv)
                        all_train_labels = tuple(train_labels)
                        all_output_scores_train = tuple(output_scores_train)
                    else:
                        all_test_labels = all_test_labels + tuple(test_labels)
                        all_output_scores_test = all_output_scores_test + tuple(
                            output_scores_test)
                        all_cv_labels = all_cv_labels + tuple(cv_labels)
                        all_output_scores_cv = all_output_scores_cv + tuple(
                            output_scores_cv)
                        all_train_labels = all_train_labels + tuple(
                            train_labels)
                        all_output_scores_train = all_output_scores_train + tuple(
                            output_scores_train)
                    # save result for fold to file
                    for r in range(len(test_labels)):
                        reportIdx = corpusList.index(
                            list(test_labelledCorpus[r]))
                        # NOTE(review): passing a plain string to writerow()
                        # emits one character per CSV column — this probably
                        # meant writer.writerow(["With c value: " + ...]).
                        writer.writerow("With c value: " + str(c_value[j]))
                        writer.writerow([
                            output_scores_test[r], output_test[r],
                            test_labels[r]
                        ])
                        writer.writerow([labelledReports[reportIdx]])
                # generate the roc curve (pooled over all folds for this C)
                fp_test, tp_test, _ = roc_curve(all_test_labels,
                                                all_output_scores_test,
                                                pos_label="positive")
                fp_cv, tp_cv, _ = roc_curve(all_cv_labels,
                                            all_output_scores_cv,
                                            pos_label="positive")
                fp_train, tp_train, _ = roc_curve(all_train_labels,
                                                  all_output_scores_train,
                                                  pos_label="positive")
                # Calculate the area under the curves
                area_test = auc(fp_test, tp_test)
                area_cv = auc(fp_cv, tp_cv)
                area_train = auc(fp_train, tp_train)
                # Store c value, tps, fps and aucs if cv auc is new best
                if area_cv > best_area_cv:
                    optimal_c[0][j] = c_value[j]
                    best_fp_test = fp_test
                    best_tp_test = tp_test
                    best_fp_cv = fp_cv
                    best_tp_cv = tp_cv
                    best_fp_train = fp_train
                    best_tp_train = tp_train
                    best_area_test = area_test
                    best_area_cv = area_cv
                    best_area_train = area_train
            # initialise and plot the average ROC curves for optimal c value
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC: c value of " +
                      str(optimal_c[0][j]))
            plt.plot(best_fp_test,
                     best_tp_test,
                     'b',
                     label='test(area = %0.2f)' % best_area_test)
            plt.plot(best_fp_cv,
                     best_tp_cv,
                     'g',
                     label='cv(area = %0.2f)' % best_area_cv)
            plt.plot(best_fp_train,
                     best_tp_train,
                     'r',
                     label='train(area = %0.2f)' % best_area_train)
            plt.legend(loc='lower right')
            plt.savefig(directory + name)
        writeFile.close()
    if learn:
        # persist the per-class optimal C values for later learn=False runs
        file = open('./model_files/svm_c_values.pkl', 'w')
        pickle.dump(optimal_c, file)
        file.close()
def labelClassificationD2V():
    """Classify labelled reports with a linear SVM over Doc2Vec vectors.

    For each labelled diagnosis file: infers a Doc2Vec vector per report,
    balances the class distribution by dropping excess negatives, trains a
    linear SVM over several random splits, writes per-report scores to a
    timestamped CSV and saves per-fold ROC curves to a PNG figure.
    """
    model = gensim.models.Doc2Vec.load("./model_files/reports.doc2vec_model")
    reports = preprocess.getReports()
    processedReports = preprocess.getProcessedReports()
    numFolds = 5  # number of folds for cross validation
    # timestamped output directory for the CSV and ROC figures
    directory = "label_classification/" + datetime.datetime.now().strftime(
        '%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])
        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])
            # initialise figure and plot
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC")
            # fetch corpus and labels
            labelledReports = []
            labelledCorpus = list()
            # The labeled data is at the start of the data set
            # Get the ids in the corpus of these first labeled examples for each class
            for i in range(
                    preprocess.getNumReports(REPORT_FILES[:j]),
                    preprocess.getNumReports(REPORT_FILES[:j]) +
                    preprocess.getNumReports([REPORT_FILES_LABELLED[j]])):
                labelledReports.append(reports[i])
                labelledCorpus.append(model.infer_vector(processedReports[i]))
            labels = np.asarray(preprocess.getData([REPORT_FILES_LABELLED[j]
                                                    ]))[:, 2]
            # list form used later to recover a report's index from its vector
            corpusList = [list(x) for x in labelledCorpus]
            ############### THIS CODE BLOCK REMOVES THE NUMBER OF NEGATIVE LABELS TO EQUALISE THE DISTRIBUTION OF CLASS LABELS. TO BE REMOVED IN FUTURE.
            count = 0
            deletes = []
            for x in range(len(labels)):
                if (labels[x] == "negative"):
                    count = count + 1
                    deletes.append(x)
                    if (count == (len(labels) -
                                  (list(labels).count("positive")) * 2)):
                        break
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)
            ##################
            numData = len(labels)  # size of the labelled data set
            dataPerFold = int(math.ceil(numData / numFolds))
            for n in range(0, numFolds):
                # split training and test data
                train_labelledCorpus, test_labelledCorpus, train_labels, test_labels = train_test_split(
                    labelledCorpus, labels, test_size=0.13)
                # build classifier
                classifier = svm.SVC(kernel='linear').fit(
                    train_labelledCorpus, train_labels)
                # compute output label and corresponding score
                output_test = classifier.predict(test_labelledCorpus)
                output_train = classifier.predict(train_labelledCorpus)
                output_scores_test = classifier.decision_function(
                    test_labelledCorpus)
                output_scores_train = classifier.decision_function(
                    train_labelledCorpus)
                # sort scores and labels in order (keeps rows aligned)
                sortList = list(
                    zip(output_scores_test, output_test, test_labels,
                        test_labelledCorpus))
                sortList.sort()
                output_scores_test, output_test, test_labels, test_labelledCorpus = zip(
                    *sortList)
                # build roc curve and plot
                fp_test, tp_test, _ = roc_curve(test_labels,
                                                output_scores_test,
                                                pos_label="positive")
                fp_train, tp_train, _ = roc_curve(train_labels,
                                                  output_scores_train,
                                                  pos_label="positive")
                # BUG FIX: the legend labels were swapped — the red test
                # curve was labelled "train" and the blue train curve "test".
                plt.plot(fp_test, tp_test, 'r',
                         label="test" if n == 0 else "")
                plt.plot(fp_train, tp_train, 'b',
                         label="train" if n == 0 else "")
                plt.legend(loc='lower right')
                plt.savefig(directory + name)
                # save result to file
                for r in range(len(test_labels)):
                    reportIdx = corpusList.index(list(test_labelledCorpus[r]))
                    writer.writerow("")
                    writer.writerow([
                        output_scores_test[r], output_test[r], test_labels[r]
                    ])
                    writer.writerow([labelledReports[reportIdx]])
        # plt.show()
        writeFile.close()
def labelClassification():
    """Cross-validated SVM label classification on LSI report vectors.

    For each labelled diagnosis class: slices the LSI-projected vectors for
    that class's labelled reports out of the dense corpus, equalises the
    class distribution, trains a linear SVM over several random train/test
    splits, pools the scores from all splits into one averaged ROC curve
    (with AUC), and writes the scored test reports to a timestamped CSV.

    Side effects: reads ./model_files/reports_lsi.mm, creates an output
    directory, writes a CSV and one ROC PNG per diagnosis.
    """
    corpus = gensim.corpora.MmCorpus('./model_files/reports_lsi.mm')
    #convert the corpus to a numpy matrix, take the transpose and convert it to a list
    corpusList = [list(x) for x in zip(*gensim.matutils.corpus2dense(corpus,corpus.num_terms,dtype=np.float64))]
    # corpusList = [list(x) for x in np.asarray(corpus)[:,:,1]]
    reports = preprocess.getReports()
    numFolds = 5 # number of folds for cross validation

    # Create the output directory
    directory = "label_classification/" + datetime.datetime.now().strftime('%m_%d_%H_%M') +"/"
    if not os.path.exists(directory):
        os.makedirs(directory)

    with open(directory+"labelClassification.csv",'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score","output label","expected label","report"])
        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])

            # initialise figure and plot
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC")

            # fetch corpus and labels
            labelledCorpus = []
            # print(range(getNumReports(REPORT_FILES[:j]),getNumReports(REPORT_FILES[:j])+getNumReports([REPORT_FILES_LABELLED[j]])))
            # The labeled data is at the start of the data set
            # Get the ids in the corpus of these first labeled examples for each class
            for i in range(preprocess.getNumReports(REPORT_FILES[:j]),preprocess.getNumReports(REPORT_FILES[:j])+preprocess.getNumReports([REPORT_FILES_LABELLED[j]])):
                labelledCorpus.append((corpusList[i]))
            # column 2 of the labelled data holds the "positive"/"negative" label
            labels = np.asarray(preprocess.getData([REPORT_FILES_LABELLED[j]]))[:,2]

            ############### THIS CODE BLOCK REMOVES THE NUMBER OF NEGATIVE LABELS TO EQUALISE THE DISTRIBUTION OF CLASS LABELS. TO BE REMOVED IN FUTURE.
            count = 0
            deletes = []
            for x in range(len(labels)):
                if (labels[x] == "negative"):
                    count = count + 1
                    deletes.append(x)
                    # stop deleting once the remaining negatives roughly balance the positives
                    if (count == (len(labels)-(list(labels).count("positive"))*2)):
                        break
            labelledCorpus = np.delete(labelledCorpus,deletes,axis=0)
            labels = np.delete(labels,deletes)
            ##################

            numData = len(labels) # size of the labelled data set
            dataPerFold = int(math.ceil(numData/numFolds))

            for n in range(0,numFolds):
                # split training and test data
                # NOTE(review): each "fold" is an independent random 13% hold-out,
                # not a disjoint k-fold partition
                train_labelledCorpus,test_labelledCorpus,train_labels,test_labels = train_test_split(labelledCorpus,labels,test_size=0.13)

                # build classifier
                classifier = svm.SVC(kernel='linear').fit(train_labelledCorpus,train_labels)
                # classifier = svm.LinearSVC(C=1.0).fit(train_labelledCorpus,train_labels)
                # classifier = neighbors.KNeighborsClassifier(n_neighbors=3).fit(train_labelledCorpus,train_labels)

                # compute output label and corresponding score
                output_test = classifier.predict(test_labelledCorpus)
                output_train = classifier.predict(train_labelledCorpus)
                output_scores_test = classifier.decision_function(test_labelledCorpus)
                output_scores_train = classifier.decision_function(train_labelledCorpus)

                # sort scores and labels in order
                sortList = list(zip(output_scores_test,output_test,test_labels,test_labelledCorpus))
                sortList.sort()
                output_scores_test,output_test,test_labels,test_labelledCorpus = zip(*sortList)

                # pool scores/labels from every split so one averaged ROC can be
                # computed after the loop (tuple concatenation across folds)
                if n ==0:
                    all_test_labels = test_labels
                    all_output_scores_test = output_scores_test
                    all_train_labels = tuple(train_labels)
                    all_output_scores_train = tuple(output_scores_train)
                else:
                    all_test_labels = all_test_labels + test_labels
                    all_output_scores_test = all_output_scores_test + output_scores_test
                    all_train_labels = all_train_labels + tuple(train_labels)
                    all_output_scores_train = all_output_scores_train+ tuple(output_scores_train)

                # save result to file
                for r in range(len(test_labels)):
                    # map the test vector back to its report via its position in corpusList
                    reportIdx = corpusList.index(list(test_labelledCorpus[r]))
                    writer.writerow("")
                    writer.writerow([output_scores_test[r],output_test[r],test_labels[r]])
                    writer.writerow([reports[reportIdx]])

            # generate the roc curve
            fp_test,tp_test,_ = roc_curve(all_test_labels,all_output_scores_test,pos_label="positive")
            fp_train,tp_train,_ = roc_curve(all_train_labels,all_output_scores_train,pos_label="positive")

            # Calculate the area under the curves
            area_test = auc(fp_test, tp_test)
            area_train = auc(fp_train, tp_train)

            # Plot the average ROC curves
            plt.plot(fp_test,tp_test,'b',label='test(area = %0.2f)' % area_test)
            plt.plot(fp_train,tp_train,'r',label='train(area = %0.2f)' % area_train)
            plt.legend(loc='lower right')
            plt.savefig(directory+name)
        writeFile.close()
def testClassification():
    """Score unlabelled reports with an SVM trained on the labelled subset.

    For each diagnosis class: trains a linear SVM on the labelled LSI
    vectors (after equalising the class distribution), scores every
    unlabelled report of that class's file, and writes out only the
    borderline cases — reports whose decision-function score is within
    `threshold` of the separating hyperplane — for manual review.

    Side effects: reads ./model_files/reports_lsi.mm, creates an output
    directory under label_tests/, writes a CSV.
    """
    threshold = 0.001  # decision scores closer to 0 than this are "uncertain"
    corpus = gensim.corpora.MmCorpus('./model_files/reports_lsi.mm')
    #convert the corpus to a numpy matrix, take the transpose and convert it to a list
    corpusList = [list(x) for x in zip(*gensim.matutils.corpus2dense(corpus,corpus.num_terms,dtype=np.float64))]
    # corpusList = [list(x) for x in np.asarray(corpus)[:,:,1]]
    reports = preprocess.getReports()

    # Create the output directory (guarded, consistent with the sibling
    # classification functions, so a re-run in the same minute cannot crash)
    directory = "label_tests/" + datetime.datetime.now().strftime('%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)

    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])
        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])

            # fetch corpus and labels
            labelledCorpus = []
            unlabelledCorpus = []
            # The labeled data is at the start of the data set
            # Get the ids in the corpus of these first labeled examples for each class
            for i in range(preprocess.getNumReports(REPORT_FILES[:j]),
                           preprocess.getNumReports(REPORT_FILES[:j]) + preprocess.getNumReports([REPORT_FILES_LABELLED[j]])):
                labelledCorpus.append(corpusList[i])
            # the remaining reports of this class's file are unlabelled
            for i in range(preprocess.getNumReports(REPORT_FILES[:j]) + preprocess.getNumReports([REPORT_FILES_LABELLED[j]]),
                           preprocess.getNumReports(REPORT_FILES[:j]) + preprocess.getNumReports([REPORT_FILES[j]])):
                unlabelledCorpus.append(corpusList[i])
            labels = np.asarray(preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]

            ############### THIS CODE BLOCK REMOVES THE NUMBER OF NEGATIVE LABELS TO EQUALISE THE DISTRIBUTION OF CLASS LABELS. TO BE REMOVED IN FUTURE.
            count = 0
            deletes = []
            for x in range(len(labels)):
                if labels[x] == "negative":
                    count = count + 1
                    deletes.append(x)
                    # stop once negatives outnumber positives by at most 2:1
                    if count == (len(labels) - (list(labels).count("positive")) * 2):
                        break
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)
            ##################

            # build classifier on ALL labelled data (no hold-out here)
            classifier = svm.SVC(kernel='linear').fit(labelledCorpus, labels)

            # compute output label and corresponding score
            output_test = classifier.predict(unlabelledCorpus)
            output_scores_test = classifier.decision_function(unlabelledCorpus)

            # sort scores and labels in order
            sortList = list(zip(output_scores_test, output_test, unlabelledCorpus))
            sortList.sort()
            output_scores_test, output_test, unlabelledCorpus = zip(*sortList)

            # save only the uncertain results to file
            for r in range(len(unlabelledCorpus)):
                if abs(output_scores_test[r]) < threshold:
                    reportIdx = corpusList.index(list(unlabelledCorpus[r]))
                    writer.writerow("")
                    writer.writerow([reportIdx, output_scores_test[r], output_test[r]])
                    writer.writerow([reports[reportIdx]])