def labelClassificationRNN(learn=True):
    """Classify labelled reports with a linear SVM over RNN report vectors.

    For each labelled diagnosis class, trains one SVM per candidate C value
    across several random train/cv/test folds, pools the fold scores into
    ROC curves, keeps the C value with the best cross-validation AUC, and
    saves the ROC figure plus per-report predictions under a timestamped
    label_classification/ directory.

    learn -- when True, grid-search the candidate C values and persist the
             optima to ./model_files/svm_c_values.pkl; when False, load the
             previously saved optima and evaluate with those only.
    """
    if learn:
        # Candidate C values, one entry per diagnosis class.
        c_vals = [[0.001, 0.001, 0.001, 0.001]]
        # BUG FIX: this row was previously *assigned* (c_vals = [...]),
        # silently discarding the 0.001 row from the grid; it must append.
        c_vals.append([0.005, 0.005, 0.005, 0.005])
        c_vals.append([0.01, 0.01, 0.01, 0.01])
        c_vals.append([0.05, 0.05, 0.05, 0.05])
        c_vals.append([0.1, 0.1, 0.1, 0.1])
        c_vals.append([0.5, 0.5, 0.5, 0.5])
        c_vals.append([1, 1, 1, 1])
        optimal_c = [[0, 0, 0, 0]]
    else:
        # BUG FIX: pickle data is binary -- open with 'rb' (was 'r').
        file = open('./model_files/svm_c_values.pkl', 'rb')
        c_vals = pickle.load(file)
        optimal_c = c_vals
        file.close()

    reports = preprocess.getReports()
    reportVectors = rnn.loadReportVecs()
    numFolds = 5  # number of folds for cross validation

    # Timestamped output directory for the CSV log and ROC figures.
    directory = "label_classification/" + datetime.datetime.now().strftime(
        '%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)

    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])

        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])

            # fetch corpus and labels
            labelledReports = []
            labelledCorpus = list()
            # The labeled data is at the start of the data set.
            # Get the ids in the corpus of these first labeled examples for
            # each class.
            for i in range(
                    preprocess.getNumReports(REPORT_FILES[:j]),
                    preprocess.getNumReports(REPORT_FILES[:j]) +
                    preprocess.getNumReports([REPORT_FILES_LABELLED[j]])):
                labelledReports.append(reports[i])
                labelledCorpus.append(reportVectors[i][:])
            labels = np.asarray(
                preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]
            corpusList = [list(x) for x in labelledCorpus]

            best_area_cv = -1  # best cross-validation AUC seen so far
            for c_value in c_vals:
                for n in range(numFolds):
                    # split training and test data
                    train_labelledCorpus, test_labelledCorpus, train_labels, test_labels = train_test_split(
                        labelledCorpus, labels, test_size=0.15)

                    # Split off the last 20% of the training set for cross
                    # validation.
                    cv_labelledCorpus = train_labelledCorpus[
                        int(0.8 * len(train_labelledCorpus)):]
                    train_labelledCorpus = train_labelledCorpus[:int(
                        0.8 * len(train_labelledCorpus))]
                    cv_labels = train_labels[int(0.8 * len(train_labels)):]
                    train_labels = train_labels[:int(0.8 * len(train_labels))]

                    # build classifier
                    classifier = svm.SVC(C=c_value[j], kernel='linear').fit(
                        train_labelledCorpus, train_labels)

                    # compute output label and corresponding score
                    output_test = classifier.predict(test_labelledCorpus)
                    output_cv = classifier.predict(cv_labelledCorpus)
                    output_train = classifier.predict(train_labelledCorpus)
                    output_scores_test = classifier.decision_function(
                        test_labelledCorpus)
                    output_scores_train = classifier.decision_function(
                        train_labelledCorpus)
                    output_scores_cv = classifier.decision_function(
                        cv_labelledCorpus)

                    # Pool labels/scores across folds so one ROC curve can be
                    # built per C value.
                    if n == 0:
                        all_test_labels = tuple(test_labels)
                        all_output_scores_test = tuple(output_scores_test)
                        all_cv_labels = tuple(cv_labels)
                        all_output_scores_cv = tuple(output_scores_cv)
                        all_train_labels = tuple(train_labels)
                        all_output_scores_train = tuple(output_scores_train)
                    else:
                        all_test_labels = all_test_labels + tuple(test_labels)
                        all_output_scores_test = all_output_scores_test + tuple(
                            output_scores_test)
                        all_cv_labels = all_cv_labels + tuple(cv_labels)
                        all_output_scores_cv = all_output_scores_cv + tuple(
                            output_scores_cv)
                        all_train_labels = all_train_labels + tuple(
                            train_labels)
                        all_output_scores_train = all_output_scores_train + tuple(
                            output_scores_train)

                    # save result for fold to file
                    for r in range(len(test_labels)):
                        reportIdx = corpusList.index(
                            list(test_labelledCorpus[r]))
                        # BUG FIX: writerow() was handed a bare string, which
                        # csv spreads one character per cell; wrap in a list.
                        writer.writerow(["With c value: " + str(c_value[j])])
                        writer.writerow([
                            output_scores_test[r], output_test[r],
                            test_labels[r]
                        ])
                        writer.writerow([labelledReports[reportIdx]])

                # generate the roc curve (pooled over all folds)
                fp_test, tp_test, _ = roc_curve(all_test_labels,
                                                all_output_scores_test,
                                                pos_label="positive")
                fp_cv, tp_cv, _ = roc_curve(all_cv_labels,
                                            all_output_scores_cv,
                                            pos_label="positive")
                fp_train, tp_train, _ = roc_curve(all_train_labels,
                                                  all_output_scores_train,
                                                  pos_label="positive")

                # Calculate the area under the curves
                area_test = auc(fp_test, tp_test)
                area_cv = auc(fp_cv, tp_cv)
                area_train = auc(fp_train, tp_train)

                # Store c value, tps, fps and aucs if cv auc is new best
                if area_cv > best_area_cv:
                    optimal_c[0][j] = c_value[j]
                    best_fp_test = fp_test
                    best_tp_test = tp_test
                    best_fp_cv = fp_cv
                    best_tp_cv = tp_cv
                    best_fp_train = fp_train
                    best_tp_train = tp_train
                    best_area_test = area_test
                    best_area_cv = area_cv
                    best_area_train = area_train

            # initialise and plot the average ROC curves for optimal c value
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC: c value of " + str(optimal_c[0][j]))
            plt.plot(best_fp_test, best_tp_test, 'b',
                     label='test(area = %0.2f)' % best_area_test)
            plt.plot(best_fp_cv, best_tp_cv, 'g',
                     label='cv(area = %0.2f)' % best_area_cv)
            plt.plot(best_fp_train, best_tp_train, 'r',
                     label='train(area = %0.2f)' % best_area_train)
            plt.legend(loc='lower right')
            plt.savefig(directory + name)
    # (the `with` block closes writeFile; the old explicit close was a no-op)

    if learn:
        # BUG FIX: pickle data is binary -- open with 'wb' (was 'w').
        file = open('./model_files/svm_c_values.pkl', 'wb')
        pickle.dump(optimal_c, file)
        file.close()
def precisionRecall(testFile):
    """Plot a precision-recall curve per retrieval model for each test query.

    testFile -- path to a CSV whose rows are
                [search term, expected class ("Brains"/"CTPA"/"Plainab"/"Pvab")].

    For every query and every model, runs the search, sweeps a list of
    similarity-score thresholds, and records precision/recall at each
    threshold to a per-query CSV plus a matplotlib figure saved under a
    timestamped precision_recall/ directory.
    """
    models = ["bow", "tfidf", "lsi", "lda", "doc2vec", "rnn"]

    # Create the output directory
    directory = "precision_recall/" + datetime.datetime.now().strftime(
        '%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)

    tests = []
    with open(testFile, 'rb') as file:
        reader = csv.reader(file)
        for row in reader:
            tests.append(row)

    # Similarity-score thresholds at which precision/recall are sampled.
    thres = [
        0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7,
        0.8, 0.9
    ]
    # Cumulative report counts marking each class's boundary in the corpus.
    numReports = [
        preprocess.getNumReports(REPORT_FILES[:1]),
        preprocess.getNumReports(REPORT_FILES[:2]),
        preprocess.getNumReports(REPORT_FILES[:3]),
        preprocess.getNumReports()
    ]
    numResults = preprocess.getNumReports()

    for searchTerm in tests:
        print(searchTerm)
        plt.figure(searchTerm[0])
        plt.xlabel("Recall")
        plt.ylabel("Precision")
        plt.title(searchTerm[0])
        with open(directory + searchTerm[0] + ".csv", 'w') as writeFile:
            writer = csv.writer(writeFile)
            for model in models:
                writer.writerow([model])
                precision = []
                recall = []
                allReports = search.search(model, numResults, searchTerm[0])
                for i in range(len(thres)):
                    truePositive = 0
                    retrieved = 0  # retrieved = truePositive + falsePositive
                    relevant = 0  # relevant = truePositive + falseNegative
                    # Keep only results scoring above the current threshold;
                    # each entry is a (corpus index, score) pair.
                    similarReports = [
                        report for report in allReports
                        if report[1] > thres[i]
                    ]
                    for report in similarReports:
                        # The corpus index determines the predicted class via
                        # the cumulative class boundaries.
                        if report[0] < numReports[0]:  # prediction: brains
                            if (searchTerm[1] == "Brains"):  # actual: brains
                                truePositive = truePositive + 1
                        elif report[0] < numReports[1]:
                            if (searchTerm[1] == "CTPA"):
                                truePositive = truePositive + 1
                        elif report[0] < numReports[2]:
                            if (searchTerm[1] == "Plainab"):
                                truePositive = truePositive + 1
                        elif report[0] < numReports[3]:
                            if (searchTerm[1] == "Pvab"):
                                truePositive = truePositive + 1
                        else:
                            print("error")
                    retrieved = retrieved + len(similarReports)
                    relevant = relevant + preprocess.getNumReports(
                        ["nlp_data/Cleaned" + searchTerm[1] + "Full.csv"])
                    # BUG FIX: force true division -- under Python 2 these
                    # are ints and `/` floors, collapsing every ratio to 0/1.
                    precision.append(
                        (float(truePositive) / retrieved) if retrieved else 0)
                    recall.append(
                        (float(truePositive) / relevant) if relevant else 0)
                    # BUG FIX: was [precision[i-1], recall[i-1]], which wrote
                    # the *previous* threshold's numbers on this row.
                    writer.writerow([precision[i], recall[i]])
                writer.writerow("")
                # plot the curve for this model
                plt.plot(recall, precision, label=model)
        plt.legend(loc='lower right')
        fileName = directory + searchTerm[0]
        plt.savefig(fileName)
def labelClassification():
    """Evaluate a linear SVM on the LSI corpus for each labelled class.

    For every labelled diagnosis file: balances the class distribution,
    runs numFolds random train/test splits, pools the per-fold scores into
    test/train ROC curves, writes each test report's score/prediction/label
    to a timestamped CSV, and saves the ROC figure.
    """
    corpus = gensim.corpora.MmCorpus('./model_files/reports_lsi.mm')
    # convert the corpus to a numpy matrix, take the transpose and convert
    # it to a list
    corpusList = [
        list(x) for x in zip(*gensim.matutils.corpus2dense(
            corpus, corpus.num_terms, dtype=np.float64))
    ]
    # corpusList = [list(x) for x in np.asarray(corpus)[:,:,1]]
    reports = preprocess.getReports()
    numFolds = 5  # number of folds for cross validation

    # Create the output directory
    directory = "label_classification/" + datetime.datetime.now().strftime(
        '%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)

    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])
        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])

            # initialise figure and plot
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC")

            # fetch corpus and labels
            labelledCorpus = []
            # print(range(getNumReports(REPORT_FILES[:j]),getNumReports(REPORT_FILES[:j])+getNumReports([REPORT_FILES_LABELLED[j]])))
            # The labeled data is at the start of the data set
            # Get the ids in the corpus of these first labeled examples for
            # each class
            for i in range(
                    preprocess.getNumReports(REPORT_FILES[:j]),
                    preprocess.getNumReports(REPORT_FILES[:j]) +
                    preprocess.getNumReports([REPORT_FILES_LABELLED[j]])):
                labelledCorpus.append((corpusList[i]))
            # Column 2 of the labelled data holds the class label string
            # ("positive"/"negative").
            labels = np.asarray(
                preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]

            ############### THIS CODE BLOCK REMOVES THE NUMBER OF NEGATIVE
            # LABELS TO EQUALISE THE DISTRIBUTION OF CLASS LABELS.
            # TO BE REMOVED IN FUTURE.
            count = 0
            deletes = []
            for x in range(len(labels)):
                if (labels[x] == "negative"):
                    count = count + 1
                    deletes.append(x)
                    # Stop once enough negatives are marked that the
                    # remainder matches the positive count.
                    if (count == (len(labels) -
                                  (list(labels).count("positive")) * 2)):
                        break
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)
            ##################

            numData = len(labels)  # size of the labelled data set
            dataPerFold = int(math.ceil(numData / numFolds))

            for n in range(0, numFolds):
                # split training and test data
                train_labelledCorpus, test_labelledCorpus, train_labels, test_labels = train_test_split(
                    labelledCorpus, labels, test_size=0.13)

                # build classifier
                classifier = svm.SVC(kernel='linear').fit(
                    train_labelledCorpus, train_labels)
                # classifier = svm.LinearSVC(C=1.0).fit(train_labelledCorpus,train_labels)
                # classifier = neighbors.KNeighborsClassifier(n_neighbors=3).fit(train_labelledCorpus,train_labels)

                # compute output label and corresponding score
                output_test = classifier.predict(test_labelledCorpus)
                output_train = classifier.predict(train_labelledCorpus)
                output_scores_test = classifier.decision_function(
                    test_labelledCorpus)
                output_scores_train = classifier.decision_function(
                    train_labelledCorpus)

                # sort scores and labels in order (keeps score, prediction,
                # label and vector aligned after the sort)
                sortList = list(
                    zip(output_scores_test, output_test, test_labels,
                        test_labelledCorpus))
                sortList.sort()
                output_scores_test, output_test, test_labels, test_labelledCorpus = zip(
                    *sortList)

                # Accumulate fold results so the ROC below is built over all
                # folds at once.
                if n == 0:
                    all_test_labels = test_labels
                    all_output_scores_test = output_scores_test
                    all_train_labels = tuple(train_labels)
                    all_output_scores_train = tuple(output_scores_train)
                else:
                    all_test_labels = all_test_labels + test_labels
                    all_output_scores_test = all_output_scores_test + output_scores_test
                    all_train_labels = all_train_labels + tuple(train_labels)
                    all_output_scores_train = all_output_scores_train + tuple(
                        output_scores_train)

                # save result to file
                for r in range(len(test_labels)):
                    # Recover the report's corpus id by matching its vector.
                    reportIdx = corpusList.index(list(test_labelledCorpus[r]))
                    writer.writerow("")
                    writer.writerow([
                        output_scores_test[r], output_test[r], test_labels[r]
                    ])
                    writer.writerow([reports[reportIdx]])

            # generate the roc curve
            fp_test, tp_test, _ = roc_curve(all_test_labels,
                                            all_output_scores_test,
                                            pos_label="positive")
            fp_train, tp_train, _ = roc_curve(all_train_labels,
                                              all_output_scores_train,
                                              pos_label="positive")

            # Calculate the area under the curves
            area_test = auc(fp_test, tp_test)
            area_train = auc(fp_train, tp_train)

            # Plot the average ROC curves
            plt.plot(fp_test, tp_test, 'b',
                     label='test(area = %0.2f)' % area_test)
            plt.plot(fp_train, tp_train, 'r',
                     label='train(area = %0.2f)' % area_train)
            plt.legend(loc='lower right')
            plt.savefig(directory + name)
    # NOTE(review): redundant -- the `with` block has already closed the file.
    writeFile.close()
def labelClassificationD2V():
    """Evaluate a linear SVM on inferred doc2vec vectors per labelled class.

    For every labelled diagnosis file: infers a doc2vec vector for each
    labelled report, balances the class distribution, runs numFolds random
    train/test splits, plots one test and one train ROC curve per fold, and
    writes per-report predictions to a timestamped CSV.
    """
    model = gensim.models.Doc2Vec.load("./model_files/reports.doc2vec_model")
    reports = preprocess.getReports()
    processedReports = preprocess.getProcessedReports()
    numFolds = 5  # number of folds for cross validation

    # Timestamped output directory for the CSV log and ROC figures.
    directory = "label_classification/" + datetime.datetime.now().strftime(
        '%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)

    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])
        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])

            # initialise figure and plot
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC")

            # fetch corpus and labels
            labelledReports = []
            labelledCorpus = list()
            # The labeled data is at the start of the data set; infer a
            # doc2vec vector for each labelled report of this class.
            for i in range(
                    preprocess.getNumReports(REPORT_FILES[:j]),
                    preprocess.getNumReports(REPORT_FILES[:j]) +
                    preprocess.getNumReports([REPORT_FILES_LABELLED[j]])):
                labelledReports.append(reports[i])
                labelledCorpus.append(model.infer_vector(processedReports[i]))
            labels = np.asarray(
                preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]
            corpusList = [list(x) for x in labelledCorpus]

            ############### THIS CODE BLOCK REMOVES THE NUMBER OF NEGATIVE
            # LABELS TO EQUALISE THE DISTRIBUTION OF CLASS LABELS.
            # TO BE REMOVED IN FUTURE.
            count = 0
            deletes = []
            for x in range(len(labels)):
                if (labels[x] == "negative"):
                    count = count + 1
                    deletes.append(x)
                    if (count == (len(labels) -
                                  (list(labels).count("positive")) * 2)):
                        break
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)
            ##################

            numData = len(labels)  # size of the labelled data set
            dataPerFold = int(math.ceil(numData / numFolds))

            for n in range(0, numFolds):
                # split training and test data
                train_labelledCorpus, test_labelledCorpus, train_labels, test_labels = train_test_split(
                    labelledCorpus, labels, test_size=0.13)

                # build classifier
                classifier = svm.SVC(kernel='linear').fit(
                    train_labelledCorpus, train_labels)

                # compute output label and corresponding score
                output_test = classifier.predict(test_labelledCorpus)
                output_train = classifier.predict(train_labelledCorpus)
                output_scores_test = classifier.decision_function(
                    test_labelledCorpus)
                output_scores_train = classifier.decision_function(
                    train_labelledCorpus)

                # sort scores and labels in order
                sortList = list(
                    zip(output_scores_test, output_test, test_labels,
                        test_labelledCorpus))
                sortList.sort()
                output_scores_test, output_test, test_labels, test_labelledCorpus = zip(
                    *sortList)

                # build roc curve and plot (one curve per fold; only the
                # first fold contributes legend entries)
                fp_test, tp_test, _ = roc_curve(test_labels,
                                                output_scores_test,
                                                pos_label="positive")
                fp_train, tp_train, _ = roc_curve(train_labels,
                                                  output_scores_train,
                                                  pos_label="positive")
                # BUG FIX: the legend was swapped -- the test curve was
                # labelled "train" and vice versa; colours now also match the
                # sibling functions (blue = test, red = train).
                plt.plot(fp_test, tp_test, 'b',
                         label="test" if n == 0 else "")
                plt.plot(fp_train, tp_train, 'r',
                         label="train" if n == 0 else "")
                plt.legend(loc='lower right')
                plt.savefig(directory + name)

                # save result to file
                for r in range(len(test_labels)):
                    reportIdx = corpusList.index(list(test_labelledCorpus[r]))
                    writer.writerow("")
                    writer.writerow([
                        output_scores_test[r], output_test[r], test_labels[r]
                    ])
                    writer.writerow([labelledReports[reportIdx]])
    # plt.show()
    # (the `with` block closes writeFile; the old explicit close was a no-op)
def testClassification():
    """Flag unlabelled reports the SVM is least certain about.

    For each diagnosis class, trains a linear SVM on all labelled LSI
    vectors, scores the class's unlabelled reports, and writes those whose
    decision score falls within `threshold` of the separating hyperplane
    (i.e. the borderline cases worth manual labelling) to a timestamped CSV
    under label_tests/.
    """
    threshold = 0.001  # max |decision score| for a report to be logged
    corpus = gensim.corpora.MmCorpus('./model_files/reports_lsi.mm')
    # convert the corpus to a numpy matrix, take the transpose and convert
    # it to a list
    corpusList = [
        list(x) for x in zip(*gensim.matutils.corpus2dense(
            corpus, corpus.num_terms, dtype=np.float64))
    ]
    # corpusList = [list(x) for x in np.asarray(corpus)[:,:,1]]
    reports = preprocess.getReports()

    # Create the output directory.
    # FIX: guard the makedirs call like the sibling functions do --
    # unguarded, a second run within the same minute raises OSError.
    directory = "label_tests/" + datetime.datetime.now().strftime(
        '%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)

    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])
        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])

            # fetch corpus and labels
            labelledCorpus = []
            unlabelledCorpus = []
            # The labeled data is at the start of each class's section of
            # the data set; the rest of the section is unlabelled.
            for i in range(
                    preprocess.getNumReports(REPORT_FILES[:j]),
                    preprocess.getNumReports(REPORT_FILES[:j]) +
                    preprocess.getNumReports([REPORT_FILES_LABELLED[j]])):
                labelledCorpus.append(corpusList[i])
            for i in range(
                    preprocess.getNumReports(REPORT_FILES[:j]) +
                    preprocess.getNumReports([REPORT_FILES_LABELLED[j]]),
                    preprocess.getNumReports(REPORT_FILES[:j]) +
                    preprocess.getNumReports([REPORT_FILES[j]])):
                unlabelledCorpus.append(corpusList[i])
            labels = np.asarray(
                preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]

            ############### THIS CODE BLOCK REMOVES THE NUMBER OF NEGATIVE
            # LABELS TO EQUALISE THE DISTRIBUTION OF CLASS LABELS.
            # TO BE REMOVED IN FUTURE.
            count = 0
            deletes = []
            for x in range(len(labels)):
                if (labels[x] == "negative"):
                    count = count + 1
                    deletes.append(x)
                    if (count == (len(labels) -
                                  (list(labels).count("positive")) * 2)):
                        break
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)
            ##################

            numData = len(labels)  # size of the labelled data set

            # build classifier on the full labelled set
            classifier = svm.SVC(kernel='linear').fit(labelledCorpus, labels)

            # compute output label and corresponding score
            output_test = classifier.predict(unlabelledCorpus)
            output_scores_test = classifier.decision_function(
                unlabelledCorpus)

            # sort scores and labels in order
            sortList = list(
                zip(output_scores_test, output_test, unlabelledCorpus))
            sortList.sort()
            output_scores_test, output_test, unlabelledCorpus = zip(*sortList)

            # save the borderline (|score| < threshold) reports to file
            for r in range(len(unlabelledCorpus)):
                if (abs(output_scores_test[r]) < threshold):
                    reportIdx = corpusList.index(list(unlabelledCorpus[r]))
                    writer.writerow("")
                    writer.writerow(
                        [reportIdx, output_scores_test[r], output_test[r]])
                    writer.writerow([reports[reportIdx]])
    # (the `with` block closes writeFile; the old explicit close was a no-op)
def testClassification(threshold, fileType):
    """Score one class's unlabelled reports; dump borderline cases + weights.

    threshold -- maximum |decision score| for a report to be written out
                 (only reports the SVM is least certain about are kept).
    fileType  -- class name (e.g. "Brains"), used to build the data
                 filenames and the report heading.

    Writes labelClassification.csv (borderline reports) and coef.csv (the
    SVM weight vectors) to the current directory, and prints the model
    parameters and their L2 norm.
    """
    REPORT_FILES = [('Cleaned' + fileType + 'Full.csv')]
    REPORT_FILES_LABELLED = [('Cleaned' + fileType + 'Labelled.csv')]
    DIAGNOSES = [fileType]
    corpus = gensim.corpora.MmCorpus('../model_files/reports_lsi.mm')
    # convert the corpus to a numpy matrix, take the transpose and convert
    # it to a list
    corpusList = [
        list(x) for x in zip(*gensim.matutils.corpus2dense(
            corpus, corpus.num_terms, dtype=np.float64))
    ]
    # corpusList = [list(x) for x in np.asarray(corpus)[:,:,1]]
    reports = preprocess.getReports(fileType)

    # BUG FIX: `parameters` was appended to below without ever being
    # initialised, raising a NameError on first use.
    parameters = []

    with open("labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])
        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow([DIAGNOSES[j], "", ""])  # Added "" for csv parsing

            # fetch corpus and labels
            labelledCorpus = []
            unlabelledCorpus = []
            # The labeled data is at the start of the data set
            # Get the ids in the corpus of these first labeled examples for
            # each class
            for i in range(
                    preprocess.getNumReports(REPORT_FILES[:j]),
                    preprocess.getNumReports(REPORT_FILES[:j]) +
                    preprocess.getNumReports([REPORT_FILES_LABELLED[j]])):
                labelledCorpus.append(corpusList[i])
            for i in range(
                    preprocess.getNumReports(REPORT_FILES[:j]) +
                    preprocess.getNumReports([REPORT_FILES_LABELLED[j]]),
                    preprocess.getNumReports(REPORT_FILES[:j]) +
                    preprocess.getNumReports([REPORT_FILES[j]])):
                unlabelledCorpus.append(corpusList[i])
            labels = np.asarray(
                preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]

            ############### THIS CODE BLOCK REMOVES THE NUMBER OF NEGATIVE
            # LABELS TO EQUALISE THE DISTRIBUTION OF CLASS LABELS.
            # TO BE REMOVED IN FUTURE.
            count = 0
            deletes = []
            for x in range(len(labels)):
                if (labels[x] == "negative"):
                    count = count + 1
                    deletes.append(x)
                    if (count == (len(labels) -
                                  (list(labels).count("positive")) * 2)):
                        break
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)
            ##################

            numData = len(labels)  # size of the labelled data set

            # build classifier
            classifier = svm.SVC(kernel='linear').fit(labelledCorpus, labels)
            print("")
            print("Model parameters:")
            print(classifier.coef_)
            print("")
            print("L2 norm of current model: " +
                  str(np.linalg.norm(classifier.coef_)))
            print("")
            for i in range(len(classifier.coef_)):
                parameters.append(classifier.coef_[i])

            # compute output label and corresponding score
            output_test = classifier.predict(unlabelledCorpus)
            output_scores_test = classifier.decision_function(
                unlabelledCorpus)

            # sort scores and labels in order
            sortList = list(
                zip(output_scores_test, output_test, unlabelledCorpus))
            sortList.sort()
            output_scores_test, output_test, unlabelledCorpus = zip(*sortList)

            # save result to file
            for r in range(len(unlabelledCorpus)):
                if (abs(output_scores_test[r]) < threshold):
                    reportIdx = corpusList.index(list(unlabelledCorpus[r]))
                    # writer.writerow("")  # Removing newline to help with future parsing
                    writer.writerow(
                        [reportIdx, output_scores_test[r], output_test[r]])
                    # Added extra "" to make csv parsing work
                    writer.writerow([reports[reportIdx], "", ""])

    # Write model parameters to file
    with open("coef.csv", 'w') as fout:
        writer = csv.writer(fout)
        for i in range(len(parameters)):
            writer.writerow(parameters[i])
    print("Model parameters saved to file.")
def precisionRecall(testFile):
    """Plot a precision-recall curve per retrieval model for each test query.

    testFile -- path to a CSV whose rows are
                [search term, expected class ("Brains"/"CTPA"/"Plainab"/"Pvab")].

    For every query and every model, runs the search, sweeps a list of
    similarity-score thresholds, and records precision/recall at each
    threshold to a per-query CSV plus a matplotlib figure saved under a
    timestamped precision_recall/ directory.
    """
    models = ["bow", "tfidf", "lsi", "lda", "doc2vec", "rnn"]

    # Create the output directory
    directory = "precision_recall/" + datetime.datetime.now().strftime(
        '%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)

    tests = []
    with open(testFile, 'rb') as file:
        reader = csv.reader(file)
        for row in reader:
            tests.append(row)

    # Similarity-score thresholds at which precision/recall are sampled.
    thres = [
        0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7,
        0.8, 0.9
    ]
    # Cumulative report counts marking each class's boundary in the corpus.
    numReports = [
        preprocess.getNumReports(REPORT_FILES[:1]),
        preprocess.getNumReports(REPORT_FILES[:2]),
        preprocess.getNumReports(REPORT_FILES[:3]),
        preprocess.getNumReports()
    ]
    numResults = preprocess.getNumReports()

    for searchTerm in tests:
        print(searchTerm)
        plt.figure(searchTerm[0])
        plt.xlabel("Recall")
        plt.ylabel("Precision")
        plt.title(searchTerm[0])
        with open(directory + searchTerm[0] + ".csv", 'w') as writeFile:
            writer = csv.writer(writeFile)
            for model in models:
                writer.writerow([model])
                precision = []
                recall = []
                allReports = search.search(model, numResults, searchTerm[0])
                for i in range(len(thres)):
                    truePositive = 0
                    retrieved = 0  # retrieved = truePositive + falsePositive
                    relevant = 0  # relevant = truePositive + falseNegative
                    # Keep only results scoring above the current threshold;
                    # each entry is a (corpus index, score) pair.
                    similarReports = [
                        report for report in allReports
                        if report[1] > thres[i]
                    ]
                    for report in similarReports:
                        # The corpus index determines the predicted class via
                        # the cumulative class boundaries.
                        if report[0] < numReports[0]:  # prediction: brains
                            if (searchTerm[1] == "Brains"):  # actual: brains
                                truePositive = truePositive + 1
                        elif report[0] < numReports[1]:
                            if (searchTerm[1] == "CTPA"):
                                truePositive = truePositive + 1
                        elif report[0] < numReports[2]:
                            if (searchTerm[1] == "Plainab"):
                                truePositive = truePositive + 1
                        elif report[0] < numReports[3]:
                            if (searchTerm[1] == "Pvab"):
                                truePositive = truePositive + 1
                        else:
                            print("error")
                    retrieved = retrieved + len(similarReports)
                    relevant = relevant + preprocess.getNumReports(
                        ["nlp_data/Cleaned" + searchTerm[1] + "Full.csv"])
                    # BUG FIX: force true division -- under Python 2 these
                    # are ints and `/` floors, collapsing every ratio to 0/1.
                    precision.append(
                        (float(truePositive) / retrieved) if retrieved else 0)
                    recall.append(
                        (float(truePositive) / relevant) if relevant else 0)
                    # BUG FIX: was [precision[i-1], recall[i-1]], which wrote
                    # the *previous* threshold's numbers on this row.
                    writer.writerow([precision[i], recall[i]])
                writer.writerow("")
                # plot the curve for this model
                plt.plot(recall, precision, label=model)
        plt.legend(loc='lower right')
        fileName = directory + searchTerm[0]
        plt.savefig(fileName)
def labelClassificationRNN(learn=True):
    """Classify labelled reports with a linear SVM over RNN report vectors.

    For each labelled diagnosis class, trains one SVM per candidate C value
    across several random train/cv/test folds, pools the fold scores into
    ROC curves, keeps the C value with the best cross-validation AUC, and
    saves the ROC figure plus per-report predictions under a timestamped
    label_classification/ directory.

    learn -- when True, grid-search the candidate C values and persist the
             optima to ./model_files/svm_c_values.pkl; when False, load the
             previously saved optima and evaluate with those only.
    """
    if learn:
        # Candidate C values, one entry per diagnosis class.
        c_vals = [[0.001, 0.001, 0.001, 0.001]]
        # BUG FIX: this row was previously *assigned* (c_vals = [...]),
        # silently discarding the 0.001 row from the grid; it must append.
        c_vals.append([0.005, 0.005, 0.005, 0.005])
        c_vals.append([0.01, 0.01, 0.01, 0.01])
        c_vals.append([0.05, 0.05, 0.05, 0.05])
        c_vals.append([0.1, 0.1, 0.1, 0.1])
        c_vals.append([0.5, 0.5, 0.5, 0.5])
        c_vals.append([1, 1, 1, 1])
        optimal_c = [[0, 0, 0, 0]]
    else:
        # BUG FIX: pickle data is binary -- open with 'rb' (was 'r').
        file = open('./model_files/svm_c_values.pkl', 'rb')
        c_vals = pickle.load(file)
        optimal_c = c_vals
        file.close()

    reports = preprocess.getReports()
    reportVectors = rnn.loadReportVecs()
    numFolds = 5  # number of folds for cross validation

    # Timestamped output directory for the CSV log and ROC figures.
    directory = "label_classification/" + datetime.datetime.now().strftime(
        '%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)

    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])

        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])

            # fetch corpus and labels
            labelledReports = []
            labelledCorpus = list()
            # The labeled data is at the start of the data set.
            # Get the ids in the corpus of these first labeled examples for
            # each class.
            for i in range(
                    preprocess.getNumReports(REPORT_FILES[:j]),
                    preprocess.getNumReports(REPORT_FILES[:j]) +
                    preprocess.getNumReports([REPORT_FILES_LABELLED[j]])):
                labelledReports.append(reports[i])
                labelledCorpus.append(reportVectors[i][:])
            labels = np.asarray(
                preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]
            corpusList = [list(x) for x in labelledCorpus]

            best_area_cv = -1  # best cross-validation AUC seen so far
            for c_value in c_vals:
                for n in range(numFolds):
                    # split training and test data
                    train_labelledCorpus, test_labelledCorpus, train_labels, test_labels = train_test_split(
                        labelledCorpus, labels, test_size=0.15)

                    # Split off the last 20% of the training set for cross
                    # validation.
                    cv_labelledCorpus = train_labelledCorpus[
                        int(0.8 * len(train_labelledCorpus)):]
                    train_labelledCorpus = train_labelledCorpus[:int(
                        0.8 * len(train_labelledCorpus))]
                    cv_labels = train_labels[int(0.8 * len(train_labels)):]
                    train_labels = train_labels[:int(0.8 * len(train_labels))]

                    # build classifier
                    classifier = svm.SVC(C=c_value[j], kernel='linear').fit(
                        train_labelledCorpus, train_labels)

                    # compute output label and corresponding score
                    output_test = classifier.predict(test_labelledCorpus)
                    output_cv = classifier.predict(cv_labelledCorpus)
                    output_train = classifier.predict(train_labelledCorpus)
                    output_scores_test = classifier.decision_function(
                        test_labelledCorpus)
                    output_scores_train = classifier.decision_function(
                        train_labelledCorpus)
                    output_scores_cv = classifier.decision_function(
                        cv_labelledCorpus)

                    # Pool labels/scores across folds so one ROC curve can be
                    # built per C value.
                    if n == 0:
                        all_test_labels = tuple(test_labels)
                        all_output_scores_test = tuple(output_scores_test)
                        all_cv_labels = tuple(cv_labels)
                        all_output_scores_cv = tuple(output_scores_cv)
                        all_train_labels = tuple(train_labels)
                        all_output_scores_train = tuple(output_scores_train)
                    else:
                        all_test_labels = all_test_labels + tuple(test_labels)
                        all_output_scores_test = all_output_scores_test + tuple(
                            output_scores_test)
                        all_cv_labels = all_cv_labels + tuple(cv_labels)
                        all_output_scores_cv = all_output_scores_cv + tuple(
                            output_scores_cv)
                        all_train_labels = all_train_labels + tuple(
                            train_labels)
                        all_output_scores_train = all_output_scores_train + tuple(
                            output_scores_train)

                    # save result for fold to file
                    for r in range(len(test_labels)):
                        reportIdx = corpusList.index(
                            list(test_labelledCorpus[r]))
                        # BUG FIX: writerow() was handed a bare string, which
                        # csv spreads one character per cell; wrap in a list.
                        writer.writerow(["With c value: " + str(c_value[j])])
                        writer.writerow([
                            output_scores_test[r], output_test[r],
                            test_labels[r]
                        ])
                        writer.writerow([labelledReports[reportIdx]])

                # generate the roc curve (pooled over all folds)
                fp_test, tp_test, _ = roc_curve(all_test_labels,
                                                all_output_scores_test,
                                                pos_label="positive")
                fp_cv, tp_cv, _ = roc_curve(all_cv_labels,
                                            all_output_scores_cv,
                                            pos_label="positive")
                fp_train, tp_train, _ = roc_curve(all_train_labels,
                                                  all_output_scores_train,
                                                  pos_label="positive")

                # Calculate the area under the curves
                area_test = auc(fp_test, tp_test)
                area_cv = auc(fp_cv, tp_cv)
                area_train = auc(fp_train, tp_train)

                # Store c value, tps, fps and aucs if cv auc is new best
                if area_cv > best_area_cv:
                    optimal_c[0][j] = c_value[j]
                    best_fp_test = fp_test
                    best_tp_test = tp_test
                    best_fp_cv = fp_cv
                    best_tp_cv = tp_cv
                    best_fp_train = fp_train
                    best_tp_train = tp_train
                    best_area_test = area_test
                    best_area_cv = area_cv
                    best_area_train = area_train

            # initialise and plot the average ROC curves for optimal c value
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC: c value of " + str(optimal_c[0][j]))
            plt.plot(best_fp_test, best_tp_test, 'b',
                     label='test(area = %0.2f)' % best_area_test)
            plt.plot(best_fp_cv, best_tp_cv, 'g',
                     label='cv(area = %0.2f)' % best_area_cv)
            plt.plot(best_fp_train, best_tp_train, 'r',
                     label='train(area = %0.2f)' % best_area_train)
            plt.legend(loc='lower right')
            plt.savefig(directory + name)
    # (the `with` block closes writeFile; the old explicit close was a no-op)

    if learn:
        # BUG FIX: pickle data is binary -- open with 'wb' (was 'w').
        file = open('./model_files/svm_c_values.pkl', 'wb')
        pickle.dump(optimal_c, file)
        file.close()
def labelClassificationD2V():
    """Evaluate a linear SVM on inferred doc2vec vectors per labelled class.

    For every labelled diagnosis file: infers a doc2vec vector for each
    labelled report, balances the class distribution, runs numFolds random
    train/test splits, plots one test and one train ROC curve per fold, and
    writes per-report predictions to a timestamped CSV.
    """
    model = gensim.models.Doc2Vec.load("./model_files/reports.doc2vec_model")
    reports = preprocess.getReports()
    processedReports = preprocess.getProcessedReports()
    numFolds = 5  # number of folds for cross validation

    # Timestamped output directory for the CSV log and ROC figures.
    directory = "label_classification/" + datetime.datetime.now().strftime(
        '%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)

    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])
        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])

            # initialise figure and plot
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC")

            # fetch corpus and labels
            labelledReports = []
            labelledCorpus = list()
            # The labeled data is at the start of the data set; infer a
            # doc2vec vector for each labelled report of this class.
            for i in range(
                    preprocess.getNumReports(REPORT_FILES[:j]),
                    preprocess.getNumReports(REPORT_FILES[:j]) +
                    preprocess.getNumReports([REPORT_FILES_LABELLED[j]])):
                labelledReports.append(reports[i])
                labelledCorpus.append(model.infer_vector(processedReports[i]))
            labels = np.asarray(
                preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]
            corpusList = [list(x) for x in labelledCorpus]

            ############### THIS CODE BLOCK REMOVES THE NUMBER OF NEGATIVE
            # LABELS TO EQUALISE THE DISTRIBUTION OF CLASS LABELS.
            # TO BE REMOVED IN FUTURE.
            count = 0
            deletes = []
            for x in range(len(labels)):
                if (labels[x] == "negative"):
                    count = count + 1
                    deletes.append(x)
                    if (count == (len(labels) -
                                  (list(labels).count("positive")) * 2)):
                        break
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)
            ##################

            numData = len(labels)  # size of the labelled data set
            dataPerFold = int(math.ceil(numData / numFolds))

            for n in range(0, numFolds):
                # split training and test data
                train_labelledCorpus, test_labelledCorpus, train_labels, test_labels = train_test_split(
                    labelledCorpus, labels, test_size=0.13)

                # build classifier
                classifier = svm.SVC(kernel='linear').fit(
                    train_labelledCorpus, train_labels)

                # compute output label and corresponding score
                output_test = classifier.predict(test_labelledCorpus)
                output_train = classifier.predict(train_labelledCorpus)
                output_scores_test = classifier.decision_function(
                    test_labelledCorpus)
                output_scores_train = classifier.decision_function(
                    train_labelledCorpus)

                # sort scores and labels in order
                sortList = list(
                    zip(output_scores_test, output_test, test_labels,
                        test_labelledCorpus))
                sortList.sort()
                output_scores_test, output_test, test_labels, test_labelledCorpus = zip(
                    *sortList)

                # build roc curve and plot (one curve per fold; only the
                # first fold contributes legend entries)
                fp_test, tp_test, _ = roc_curve(test_labels,
                                                output_scores_test,
                                                pos_label="positive")
                fp_train, tp_train, _ = roc_curve(train_labels,
                                                  output_scores_train,
                                                  pos_label="positive")
                # BUG FIX: the legend was swapped -- the test curve was
                # labelled "train" and vice versa; colours now also match the
                # sibling functions (blue = test, red = train).
                plt.plot(fp_test, tp_test, 'b',
                         label="test" if n == 0 else "")
                plt.plot(fp_train, tp_train, 'r',
                         label="train" if n == 0 else "")
                plt.legend(loc='lower right')
                plt.savefig(directory + name)

                # save result to file
                for r in range(len(test_labels)):
                    reportIdx = corpusList.index(list(test_labelledCorpus[r]))
                    writer.writerow("")
                    writer.writerow([
                        output_scores_test[r], output_test[r], test_labels[r]
                    ])
                    writer.writerow([labelledReports[reportIdx]])
    # plt.show()
    # (the `with` block closes writeFile; the old explicit close was a no-op)
def labelClassification():
    """Classify labelled reports with a linear SVM over LSI vectors.

    For each labelled diagnosis class: take the LSI vectors of the labelled
    reports, equalise the class distribution by dropping surplus "negative"
    examples, run `numFolds` random train/test splits accumulating scores,
    then plot averaged ROC curves (with AUC) and write the scored test
    predictions to ``label_classification/<timestamp>/labelClassification.csv``.
    """
    corpus = gensim.corpora.MmCorpus('./model_files/reports_lsi.mm')
    # convert the corpus to a numpy matrix, take the transpose and convert it to a list
    corpusList = [list(x) for x in
                  zip(*gensim.matutils.corpus2dense(corpus, corpus.num_terms, dtype=np.float64))]
    # corpusList = [list(x) for x in np.asarray(corpus)[:,:,1]]
    reports = preprocess.getReports()
    numFolds = 5  # number of folds for cross validation

    # Create the output directory
    directory = "label_classification/" + datetime.datetime.now().strftime('%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)

    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])
        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])

            # initialise figure and plot
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC")

            # fetch corpus and labels.
            # The labelled data sits at the start of each class's section of
            # the data set; collect the corresponding LSI vectors.
            labelledCorpus = []
            for i in range(preprocess.getNumReports(REPORT_FILES[:j]),
                           preprocess.getNumReports(REPORT_FILES[:j])
                           + preprocess.getNumReports([REPORT_FILES_LABELLED[j]])):
                labelledCorpus.append((corpusList[i]))
            labels = np.asarray(preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]

            ############### THIS CODE BLOCK REMOVES THE NUMBER OF NEGATIVE LABELS
            # TO EQUALISE THE DISTRIBUTION OF CLASS LABELS. TO BE REMOVED IN FUTURE.
            count = 0
            deletes = []
            for x in range(len(labels)):
                if (labels[x] == "negative"):
                    count = count + 1
                    deletes.append(x)
                    if (count == (len(labels) - (list(labels).count("positive")) * 2)):
                        break
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)
            ##################

            for n in range(0, numFolds):
                # split training and test data
                train_labelledCorpus, test_labelledCorpus, train_labels, test_labels = \
                    train_test_split(labelledCorpus, labels, test_size=0.13)

                # build classifier
                classifier = svm.SVC(kernel='linear').fit(train_labelledCorpus, train_labels)
                # classifier = svm.LinearSVC(C=1.0).fit(train_labelledCorpus,train_labels)
                # classifier = neighbors.KNeighborsClassifier(n_neighbors=3).fit(train_labelledCorpus,train_labels)

                # compute output label and corresponding score
                output_test = classifier.predict(test_labelledCorpus)
                output_train = classifier.predict(train_labelledCorpus)
                output_scores_test = classifier.decision_function(test_labelledCorpus)
                output_scores_train = classifier.decision_function(train_labelledCorpus)

                # sort test results by decision score. Sort on the score only:
                # comparing whole tuples would fall through to the numpy
                # feature vectors on tied scores and raise.
                sortList = list(zip(output_scores_test, output_test, test_labels, test_labelledCorpus))
                sortList.sort(key=lambda entry: entry[0])
                output_scores_test, output_test, test_labels, test_labelledCorpus = zip(*sortList)

                # accumulate labels/scores across folds so a single averaged
                # ROC curve can be drawn per diagnosis class after the loop
                if n == 0:
                    all_test_labels = test_labels
                    all_output_scores_test = output_scores_test
                    all_train_labels = tuple(train_labels)
                    all_output_scores_train = tuple(output_scores_train)
                else:
                    all_test_labels = all_test_labels + test_labels
                    all_output_scores_test = all_output_scores_test + output_scores_test
                    all_train_labels = all_train_labels + tuple(train_labels)
                    all_output_scores_train = all_output_scores_train + tuple(output_scores_train)

                # save result to file
                for r in range(len(test_labels)):
                    reportIdx = corpusList.index(list(test_labelledCorpus[r]))
                    writer.writerow("")
                    writer.writerow([output_scores_test[r], output_test[r], test_labels[r]])
                    writer.writerow([reports[reportIdx]])

            # generate the roc curve
            fp_test, tp_test, _ = roc_curve(all_test_labels, all_output_scores_test, pos_label="positive")
            fp_train, tp_train, _ = roc_curve(all_train_labels, all_output_scores_train, pos_label="positive")

            # Calculate the area under the curves
            area_test = auc(fp_test, tp_test)
            area_train = auc(fp_train, tp_train)

            # Plot the average ROC curves
            plt.plot(fp_test, tp_test, 'b', label='test(area = %0.2f)' % area_test)
            plt.plot(fp_train, tp_train, 'r', label='train(area = %0.2f)' % area_train)
            plt.legend(loc='lower right')
            plt.savefig(directory + name)
def testClassification():
    """Surface unlabelled reports near the SVM decision boundary.

    For each diagnosis class: train a linear SVM on all labelled LSI vectors
    (after equalising the class distribution), score the unlabelled reports,
    and write any report whose absolute decision score is below `threshold`
    to ``label_tests/<timestamp>/labelClassification.csv`` — these are the
    borderline cases most worth manual labelling.
    """
    threshold = 0.001  # max |decision score| for a report to be written out
    corpus = gensim.corpora.MmCorpus('./model_files/reports_lsi.mm')
    # convert the corpus to a numpy matrix, take the transpose and convert it to a list
    corpusList = [list(x) for x in
                  zip(*gensim.matutils.corpus2dense(corpus, corpus.num_terms, dtype=np.float64))]
    # corpusList = [list(x) for x in np.asarray(corpus)[:,:,1]]
    reports = preprocess.getReports()

    # Create the output directory. Guarded with an existence check for
    # consistency with the other classification routines (a bare makedirs
    # raises if two runs share the same minute-resolution timestamp).
    directory = "label_tests/" + datetime.datetime.now().strftime('%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)

    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])
        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])

            # fetch corpus and labels.
            # The labelled data sits at the start of each class's section;
            # the remainder of the class's section is unlabelled.
            labelledCorpus = []
            unlabelledCorpus = []
            for i in range(preprocess.getNumReports(REPORT_FILES[:j]),
                           preprocess.getNumReports(REPORT_FILES[:j])
                           + preprocess.getNumReports([REPORT_FILES_LABELLED[j]])):
                labelledCorpus.append(corpusList[i])
            for i in range(preprocess.getNumReports(REPORT_FILES[:j])
                           + preprocess.getNumReports([REPORT_FILES_LABELLED[j]]),
                           preprocess.getNumReports(REPORT_FILES[:j])
                           + preprocess.getNumReports([REPORT_FILES[j]])):
                unlabelledCorpus.append(corpusList[i])
            labels = np.asarray(preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]

            ############### THIS CODE BLOCK REMOVES THE NUMBER OF NEGATIVE LABELS
            # TO EQUALISE THE DISTRIBUTION OF CLASS LABELS. TO BE REMOVED IN FUTURE.
            count = 0
            deletes = []
            for x in range(len(labels)):
                if (labels[x] == "negative"):
                    count = count + 1
                    deletes.append(x)
                    if (count == (len(labels) - (list(labels).count("positive")) * 2)):
                        break
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)
            ##################

            # build classifier on the full labelled set
            classifier = svm.SVC(kernel='linear').fit(labelledCorpus, labels)

            # compute output label and corresponding score for unlabelled data
            output_test = classifier.predict(unlabelledCorpus)
            output_scores_test = classifier.decision_function(unlabelledCorpus)

            # sort results by decision score. Sort on the score only:
            # comparing whole tuples would fall through to the numpy
            # feature vectors on tied scores and raise.
            sortList = list(zip(output_scores_test, output_test, unlabelledCorpus))
            sortList.sort(key=lambda entry: entry[0])
            output_scores_test, output_test, unlabelledCorpus = zip(*sortList)

            # save borderline results (|score| < threshold) to file
            for r in range(len(unlabelledCorpus)):
                if (abs(output_scores_test[r]) < threshold):
                    reportIdx = corpusList.index(list(unlabelledCorpus[r]))
                    writer.writerow("")
                    writer.writerow([reportIdx, output_scores_test[r], output_test[r]])
                    writer.writerow([reports[reportIdx]])