def buildDoc2VecModel():
    """Train a Doc2Vec model on the preprocessed reports and save it to disk.

    Every report returned by ``preprocess.getProcessedReports()`` becomes one
    ``TaggedDocument`` tagged with its position in that list.  The model is
    trained for ten passes with a manually decayed learning rate and written to
    ``./model_files/reports.doc2vec_model``.
    """
    processedReports = preprocess.getProcessedReports()

    # One tagged document per report; the tag is the report's list index.
    taggedDocuments = [
        gensim.models.doc2vec.TaggedDocument(words=tokens, tags=[position])
        for position, tokens in enumerate(processedReports)
    ]

    model = gensim.models.Doc2Vec(
        size=300,
        min_count=5,
        workers=16,
        dm=1,
        dbow_words=1,
        negative=20,
    )
    model.build_vocab(taggedDocuments)

    # Manual schedule: start at 0.025 and lower the learning rate by 0.001
    # after each pass, pinning min_alpha so the rate stays fixed within a pass.
    model.alpha = 0.025
    for epoch in range(10):
        print(epoch)
        model.train(taggedDocuments)
        model.alpha = model.alpha - 0.001
        model.min_alpha = model.alpha

    model.save("./model_files/reports.doc2vec_model")
def buildDoc2VecModel():
    """Train a Doc2Vec model on the preprocessed reports and save it to disk.

    Every report returned by ``preprocess.getProcessedReports()`` becomes one
    ``TaggedDocument`` tagged with its position in that list.  The model is
    trained for ten passes with a manually decayed learning rate and written to
    ``./model_files/reports.doc2vec_model``.
    """
    processedReports = preprocess.getProcessedReports()

    # One tagged document per report; the tag is the report's list index.
    taggedDocuments = [
        gensim.models.doc2vec.TaggedDocument(words=tokens, tags=[position])
        for position, tokens in enumerate(processedReports)
    ]

    model = gensim.models.Doc2Vec(
        size=300,
        min_count=5,
        workers=16,
        dm=1,
        dbow_words=1,
        negative=20,
    )
    model.build_vocab(taggedDocuments)

    # Manual schedule: start at 0.025 and lower the learning rate by 0.001
    # after each pass, pinning min_alpha so the rate stays fixed within a pass.
    model.alpha = 0.025
    for epoch in range(10):
        print(epoch)
        model.train(taggedDocuments)
        model.alpha = model.alpha - 0.001
        model.min_alpha = model.alpha

    model.save("./model_files/reports.doc2vec_model")
def buildDictionary(fileType):
    """Build and persist the gensim dictionary and bag-of-words corpus.

    Args:
        fileType: Forwarded unchanged to ``preprocess.getProcessedReports``
            to select which reports are loaded.

    Writes ``../model_files/reports.dict`` (the dictionary) and
    ``../model_files/reports.mm`` (the corpus in Matrix Market format), and
    prints progress messages along the way.
    """
    processedReports = preprocess.getProcessedReports(fileType)
    print("files loaded")

    # Token <-> id mapping over all reports.
    tokenDictionary = gensim.corpora.Dictionary(processedReports)
    tokenDictionary.save('../model_files/reports.dict')
    print(tokenDictionary)
    print("dictionary finished")

    # Bag-of-words representation of every report, in report order.
    bowCorpus = []
    for report in processedReports:
        bowCorpus.append(tokenDictionary.doc2bow(report))
    gensim.corpora.MmCorpus.serialize('../model_files/reports.mm', bowCorpus)
    print("corpus finished")
def buildWord2VecModel():
    """Train a Word2Vec model on the preprocessed reports and sanity-check it.

    The model is trained with ``min_count=3``, has ``init_sims(replace=True)``
    applied, and is saved to ``./model_files/reports.word2vec_model``.  A few
    hard-coded probe queries are then printed as a quick smoke test; they
    assume the probe words occur in the trained vocabulary.
    """
    processedReports = preprocess.getProcessedReports()

    model = gensim.models.Word2Vec(processedReports, min_count=3)
    model.init_sims(replace=True)
    model.save("./model_files/reports.word2vec_model")
    print(model)

    # (banner, query) pairs; each query is evaluated only after its banner
    # has been printed, matching a plain sequence of print statements.
    probes = (
        ("----------------------------------similarity test",
         lambda: model.similarity("head", "brain")),
        ("----------------------------------raw numpy vector of word",
         lambda: model["age"]),
        ("----------------------------------remove outlier",
         lambda: model.doesnt_match("hours four age".split())),
        ("----------------------------------similar words",
         lambda: model.most_similar("haem")),
    )
    for banner, runQuery in probes:
        print(banner)
        print(runQuery())

    print("script finished")
def buildDictionary():
    """Build and persist the gensim dictionary and bag-of-words corpus.

    Loads the reports via ``preprocess.getProcessedReports()``, then writes
    ``./model_files/reports.dict`` (the dictionary) and
    ``./model_files/reports.mm`` (the corpus in Matrix Market format),
    printing progress messages along the way.
    """
    processedReports = preprocess.getProcessedReports()
    print("files loaded")

    # Token <-> id mapping over all reports.
    tokenDictionary = gensim.corpora.Dictionary(processedReports)
    tokenDictionary.save('./model_files/reports.dict')
    print(tokenDictionary)
    print("dictionary finished")

    # Bag-of-words representation of every report, in report order.
    bowCorpus = []
    for report in processedReports:
        bowCorpus.append(tokenDictionary.doc2bow(report))
    gensim.corpora.MmCorpus.serialize('./model_files/reports.mm', bowCorpus)
    print("corpus finished")
def buildWord2VecModel():
    """Train a Word2Vec model on the preprocessed reports and sanity-check it.

    The model is trained with ``min_count=3``, has ``init_sims(replace=True)``
    applied, and is saved to ``./model_files/reports.word2vec_model``.  A few
    hard-coded probe queries are then printed as a quick smoke test; they
    assume the probe words occur in the trained vocabulary.
    """
    processedReports = preprocess.getProcessedReports()

    model = gensim.models.Word2Vec(processedReports, min_count=3)
    model.init_sims(replace=True)
    model.save("./model_files/reports.word2vec_model")
    print(model)

    # (banner, query) pairs; each query is evaluated only after its banner
    # has been printed, matching a plain sequence of print statements.
    probes = (
        ("----------------------------------similarity test",
         lambda: model.similarity("head", "brain")),
        ("----------------------------------raw numpy vector of word",
         lambda: model["age"]),
        ("----------------------------------remove outlier",
         lambda: model.doesnt_match("hours four age".split())),
        ("----------------------------------similar words",
         lambda: model.most_similar("haem")),
    )
    for banner, runQuery in probes:
        print(banner)
        print(runQuery())

    print("script finished")
def labelClassificationD2V():
    """Evaluate a linear SVM on Doc2Vec vectors of the labelled reports.

    For each entry of ``REPORT_FILES_LABELLED`` the function:

    1. infers a Doc2Vec vector for every labelled report of that diagnosis,
    2. drops surplus "negative" examples so the classes are roughly balanced,
    3. runs ``numFolds`` random train/test splits, training a linear SVM on
       each and plotting train and test ROC curves into one figure,
    4. appends each test example's score, predicted label, expected label and
       report text to ``labelClassification.csv``.

    The CSV and the ROC figures are written to a directory named after the
    current month/day/hour/minute under ``label_classification/``.
    """
    model = gensim.models.Doc2Vec.load("./model_files/reports.doc2vec_model")
    reports = preprocess.getReports()
    processedReports = preprocess.getProcessedReports()

    numFolds = 5  # number of random train/test splits per diagnosis

    directory = ("label_classification/"
                 + datetime.datetime.now().strftime('%m_%d_%H_%M') + "/")
    if not os.path.exists(directory):
        os.makedirs(directory)

    # The with-statement closes the file; no explicit close() is needed.
    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])

        for j in range(len(REPORT_FILES_LABELLED)):
            # Two blank rows, then a heading row naming the diagnosis.
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])

            # initialise figure and plot
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC")

            # The labelled data is at the start of each diagnosis' reports:
            # skip the reports of all earlier files, then take as many
            # reports as the labelled file for this diagnosis contains.
            firstIdx = preprocess.getNumReports(REPORT_FILES[:j])
            lastIdx = firstIdx + preprocess.getNumReports(
                [REPORT_FILES_LABELLED[j]])

            labelledReports = []
            labelledCorpus = []
            for i in range(firstIdx, lastIdx):
                labelledReports.append(reports[i])
                labelledCorpus.append(model.infer_vector(processedReports[i]))

            # Labels are read from column 2 of the labelled file's data.
            labels = np.asarray(
                preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]

            # Plain-list copy of every vector, taken before balancing, so a
            # test vector can be mapped back to its report text later.
            corpusList = [list(x) for x in labelledCorpus]

            # TEMPORARY class balancing (to be removed in future): drop the
            # first `numToDrop` negatives so that, of the examples left, as
            # many are non-positive as positive.  When negatives do not
            # outnumber positives, nothing is dropped (clamped at zero)
            # instead of discarding every negative example.
            numToDrop = len(labels) - 2 * list(labels).count("positive")
            negativeIdx = [idx for idx, label in enumerate(labels)
                           if label == "negative"]
            deletes = np.asarray(negativeIdx[:max(numToDrop, 0)], dtype=int)
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)

            for n in range(numFolds):
                # split training and test data
                (train_labelledCorpus, test_labelledCorpus,
                 train_labels, test_labels) = train_test_split(
                    labelledCorpus, labels, test_size=0.13)

                # build classifier
                classifier = svm.SVC(kernel='linear').fit(
                    train_labelledCorpus, train_labels)

                # compute output label and corresponding score
                output_test = classifier.predict(test_labelledCorpus)
                output_scores_test = classifier.decision_function(
                    test_labelledCorpus)
                output_scores_train = classifier.decision_function(
                    train_labelledCorpus)

                # Sort test results by (score, predicted, expected).  The key
                # deliberately excludes the vectors: comparing numpy arrays
                # during a tie would raise ValueError.
                sortList = sorted(
                    zip(output_scores_test, output_test, test_labels,
                        test_labelledCorpus),
                    key=lambda row: row[:3])
                output_scores_test, output_test, test_labels, _ = zip(
                    *sortList)

                # build roc curve and plot
                fp_test, tp_test, _ = roc_curve(
                    test_labels, output_scores_test, pos_label="positive")
                fp_train, tp_train, _ = roc_curve(
                    train_labels, output_scores_train, pos_label="positive")

                # Red = test curve, blue = train curve.  Only the first fold
                # contributes legend entries to avoid duplicates.
                plt.plot(fp_test, tp_test, 'r',
                         label="test" if n == 0 else "")
                plt.plot(fp_train, tp_train, 'b',
                         label="train" if n == 0 else "")
                plt.legend(loc='lower right')
                plt.savefig(directory + name)

                # save result to file
                for score, predicted, expected, vector in sortList:
                    reportIdx = corpusList.index(list(vector))
                    writer.writerow("")
                    writer.writerow([score, predicted, expected])
                    writer.writerow([labelledReports[reportIdx]])
def labelClassificationD2V():
    """Evaluate a linear SVM on Doc2Vec vectors of the labelled reports.

    For each entry of ``REPORT_FILES_LABELLED`` the function:

    1. infers a Doc2Vec vector for every labelled report of that diagnosis,
    2. drops surplus "negative" examples so the classes are roughly balanced,
    3. runs ``numFolds`` random train/test splits, training a linear SVM on
       each and plotting train and test ROC curves into one figure,
    4. appends each test example's score, predicted label, expected label and
       report text to ``labelClassification.csv``.

    The CSV and the ROC figures are written to a directory named after the
    current month/day/hour/minute under ``label_classification/``.
    """
    model = gensim.models.Doc2Vec.load("./model_files/reports.doc2vec_model")
    reports = preprocess.getReports()
    processedReports = preprocess.getProcessedReports()

    numFolds = 5  # number of random train/test splits per diagnosis

    directory = ("label_classification/"
                 + datetime.datetime.now().strftime('%m_%d_%H_%M') + "/")
    if not os.path.exists(directory):
        os.makedirs(directory)

    # The with-statement closes the file; no explicit close() is needed.
    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])

        for j in range(len(REPORT_FILES_LABELLED)):
            # Two blank rows, then a heading row naming the diagnosis.
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])

            # initialise figure and plot
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC")

            # The labelled data is at the start of each diagnosis' reports:
            # skip the reports of all earlier files, then take as many
            # reports as the labelled file for this diagnosis contains.
            firstIdx = preprocess.getNumReports(REPORT_FILES[:j])
            lastIdx = firstIdx + preprocess.getNumReports(
                [REPORT_FILES_LABELLED[j]])

            labelledReports = []
            labelledCorpus = []
            for i in range(firstIdx, lastIdx):
                labelledReports.append(reports[i])
                labelledCorpus.append(model.infer_vector(processedReports[i]))

            # Labels are read from column 2 of the labelled file's data.
            labels = np.asarray(
                preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]

            # Plain-list copy of every vector, taken before balancing, so a
            # test vector can be mapped back to its report text later.
            corpusList = [list(x) for x in labelledCorpus]

            # TEMPORARY class balancing (to be removed in future): drop the
            # first `numToDrop` negatives so that, of the examples left, as
            # many are non-positive as positive.  When negatives do not
            # outnumber positives, nothing is dropped (clamped at zero)
            # instead of discarding every negative example.
            numToDrop = len(labels) - 2 * list(labels).count("positive")
            negativeIdx = [idx for idx, label in enumerate(labels)
                           if label == "negative"]
            deletes = np.asarray(negativeIdx[:max(numToDrop, 0)], dtype=int)
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)

            for n in range(numFolds):
                # split training and test data
                (train_labelledCorpus, test_labelledCorpus,
                 train_labels, test_labels) = train_test_split(
                    labelledCorpus, labels, test_size=0.13)

                # build classifier
                classifier = svm.SVC(kernel='linear').fit(
                    train_labelledCorpus, train_labels)

                # compute output label and corresponding score
                output_test = classifier.predict(test_labelledCorpus)
                output_scores_test = classifier.decision_function(
                    test_labelledCorpus)
                output_scores_train = classifier.decision_function(
                    train_labelledCorpus)

                # Sort test results by (score, predicted, expected).  The key
                # deliberately excludes the vectors: comparing numpy arrays
                # during a tie would raise ValueError.
                sortList = sorted(
                    zip(output_scores_test, output_test, test_labels,
                        test_labelledCorpus),
                    key=lambda row: row[:3])
                output_scores_test, output_test, test_labels, _ = zip(
                    *sortList)

                # build roc curve and plot
                fp_test, tp_test, _ = roc_curve(
                    test_labels, output_scores_test, pos_label="positive")
                fp_train, tp_train, _ = roc_curve(
                    train_labels, output_scores_train, pos_label="positive")

                # Red = test curve, blue = train curve.  Only the first fold
                # contributes legend entries to avoid duplicates.
                plt.plot(fp_test, tp_test, 'r',
                         label="test" if n == 0 else "")
                plt.plot(fp_train, tp_train, 'b',
                         label="train" if n == 0 else "")
                plt.legend(loc='lower right')
                plt.savefig(directory + name)

                # save result to file
                for score, predicted, expected, vector in sortList:
                    reportIdx = corpusList.index(list(vector))
                    writer.writerow("")
                    writer.writerow([score, predicted, expected])
                    writer.writerow([labelledReports[reportIdx]])