def training():
    """Train the model one epoch at a time, checkpointing every 10th epoch.

    Reads the module-level settings ``input_dim``, ``window_size``,
    ``predict_days``, ``data_frequency``, ``total_epochs`` and ``batchSize``.
    Checkpoints (``epoch_<i>.h5``, with ``i`` the zero-based epoch index) and
    the pickled list of per-epoch losses (``loss``) are written into a
    directory under ``./model/`` named after those settings.
    """
    # get data && model
    [x_train, y_train], _ = getData(input_dim, window_size, predict_days,
                                    data_frequency, "train", False)
    model = get_model(x_train)
    loss = []

    # Fit && save model/history
    path = (f"./model/epoch_{total_epochs},dim_{input_dim},"
            f"win_{window_size},freq_{data_frequency}/")
    # The permission bits must be written in octal: decimal 755 is 0o1363.
    # makedirs also creates a missing "./model" parent and tolerates an
    # already existing directory.
    os.makedirs(path, mode=0o755, exist_ok=True)

    # TODO: if there's existing model, pick up and keep training

    # Visualize the model. summary() prints the table itself and returns
    # None, so it must not be wrapped in print().
    model.summary()

    # train
    for i in range(total_epochs):
        print(f'epoch: {i + 1}/{total_epochs}')
        history = model.fit(x_train, y_train, epochs=1, batch_size=batchSize)
        loss += history.history['loss']
        # Checkpoint after every 10th epoch; the file name keeps the
        # zero-based index (epoch_9.h5, epoch_19.h5, ...).
        if (i + 1) % 10 == 0:
            model.save(f'{path}epoch_{i}.h5')

    # save
    with open(f'{path}loss', 'wb') as fp:
        pickle.dump(loss, fp)
def training():
    """Train the model one epoch at a time, saving a checkpoint after each epoch.

    Reads the module-level settings ``input_dim``, ``window_size``,
    ``predict_days``, ``total_epochs`` and ``batchSize``.  One checkpoint per
    epoch (``epoch_<i>.h5``, ``i`` zero-based) and the pickled list of
    per-epoch losses (``loss``) are written into a directory under
    ``./model/class/`` named after those settings.
    """
    # get data && model
    [x_train, y_train], _ = getData(input_dim, window_size, predict_days,
                                    "train", True)
    model = get_model(x_train)
    loss = []

    # Fit && save model/history
    path = f"./model/class/epoch_{total_epochs},dim_{input_dim},win_{window_size}/"
    # The permission bits must be written in octal: decimal 755 is 0o1363.
    # makedirs also creates the missing "./model/class" parents and tolerates
    # an already existing directory.
    os.makedirs(path, mode=0o755, exist_ok=True)

    # Visualize the model
    # print(model.summary())

    # train
    for i in range(total_epochs):
        print(f'epoch: {i + 1}/{total_epochs}')
        history = model.fit(x_train, y_train, epochs=1, batch_size=batchSize)
        model.save(f'{path}epoch_{i}.h5')
        loss += history.history['loss']

    # save
    with open(f'{path}loss', 'wb') as fp:
        pickle.dump(loss, fp)
def predict(epoch=300, dim=4, win=49, pred=20, freq=5):
    """Predict prices with the saved model matching the given settings.

    The model is loaded from
    ``./model/draw/epoch_<epoch>,dim_<dim>,win_<win>,freq_<freq>.h5``.

    Args:
        epoch, dim, win, freq: values used to build the model file name.
        dim, win, pred, freq: also forwarded to ``getData`` to build the
            test inputs.

    Returns:
        The first row of the inverse-transformed predictions (first output
        column of the model, rescaled with ``scaler_list[0]``), or the string
        ``"no such model!"`` when the model file does not exist.
    """
    model_name = f"./model/draw/epoch_{epoch},dim_{dim},win_{win},freq_{freq}.h5"
    # Check for the model before doing any data loading so that a missing
    # file is reported without paying for the preprocessing.
    if not os.path.isfile(model_name):
        return "no such model!"

    # Get data
    [x_test, y_test], scaler_list = getData(dim, win, pred, freq, "test")

    # Model predict
    K.clear_session()
    model = load_model(model_name)
    model_output = model.predict(x_test)

    # Keep only the first output column of every prediction, shaped as a
    # single row for the scaler.
    close_price = np.reshape([row[0] for row in model_output], (1, -1))

    # re-scale back
    predicted_stock_price = scaler_list[0].inverse_transform(close_price)
    return predicted_stock_price[0]
def run(num_feature, isCF, isDF, isCustom):
    """Run the full pipeline: preprocess, build matrices, train, predict.

    Reads the train/dev/test texts and the stop words through ``iohelper``,
    turns every text into a bag of words, builds the data matrices with
    ``preprocess.getData_custom`` (when ``isCustom`` is true) or
    ``preprocess.getData``, trains with ``multiLR.train`` and writes the
    dev and test predictions through ``iohelper``.
    """
    train_texts, lst_train_stars = iohelper.readTrain()
    dev_texts = iohelper.readDev()
    test_texts = iohelper.readTest()
    stop_words = iohelper.readStopWords()

    def bags_of_words(texts):
        # Clean every text first, then convert the cleaned texts to BOW.
        cleaned = [preprocess.preprocess(text, stop_words) for text in texts]
        return [preprocess.toBOW(text) for text in cleaned]

    lst_train_BOW = bags_of_words(train_texts)
    lst_dev_BOW = bags_of_words(dev_texts)
    lst_test_BOW = bags_of_words(test_texts)
    print("PREPROCESS FINISHED!")

    build_matrices = preprocess.getData_custom if isCustom else preprocess.getData
    train_data, dev_data, test_data = build_matrices(
        lst_train_BOW, lst_dev_BOW, lst_test_BOW, num_feature, isCF, isDF)
    print("DATA MATRIX GENERATED!")

    W = multiLR.train(train_data, lst_train_stars)

    print("START PREDICT ON DEVELOPMENT DATA!")
    dev_hard, dev_soft = multiLR.pred(dev_data, W)
    iohelper.writeDevPred(dev_hard, dev_soft)

    print("START PREDICT ON TEST DATA!")
    test_hard, test_soft = multiLR.pred(test_data, W)
    iohelper.writeTestPred(test_hard, test_soft)
def genData():
    """Write the train and dev data matrices to ``../data/svm_train`` / ``svm_dev``.

    Texts are read through ``iohelper``, converted to bags of words, and
    turned into matrices by ``preprocess.getData`` with 2000 features,
    ``isCF=False`` and ``isDF=True`` (no test split is passed).
    """
    train_texts, lst_train_stars = iohelper.readTrain()
    dev_texts = iohelper.readDev()
    stop_words = iohelper.readStopWords()

    def bags_of_words(texts):
        # Clean every text first, then convert the cleaned texts to BOW.
        cleaned = [preprocess.preprocess(text, stop_words) for text in texts]
        return [preprocess.toBOW(text) for text in cleaned]

    lst_train_BOW = bags_of_words(train_texts)
    lst_dev_BOW = bags_of_words(dev_texts)
    print("PREPROCESS FINISHED!")

    matrices = preprocess.getData(lst_train_BOW, lst_dev_BOW, [], 2000,
                                  isCF=False, isDF=True)
    train_data, dev_data, _ = matrices
    print("DATA MATRIX GENERATED!")

    writeTrain(train_data, lst_train_stars, "../data/svm_train")
    writeTest(dev_data, "../data/svm_dev")
import pandas as pd import matplotlib.pyplot as plt from keras import backend as K from keras.models import load_model from preprocess import getData ## Variable MSE = [] total_epochs = 5 input_dim = 4 window_size = 60 predict_days = 20 # Get data [x_test, y_test], scaler_list = getData(input_dim, window_size, predict_days, "test", True) ## Model predict path = f"./model/class/epoch_{total_epochs},dim_{input_dim},win_{window_size}/" for i in range(total_epochs): start = time.time() K.clear_session() # load model model = load_model(f'{path}epoch_{i}.h5') print(f'read model: epoch_{i}.h5') output = model.predict(x_test) acc = 0 total_guess = predict_days for j in range(len(output)):
def main():
    """Train a sentence-pair model and save it whenever validation accuracy improves.

    Command-line options select the GPU id, the directories holding
    ``sts2016_train.stasis.csv`` and ``glove.840B.300d.txt``, optional
    pickled vocabulary / vectors to reuse, the model (``BILSTM`` or the
    default ``NTIFullTreeMatching``) and a ``-t`` switch that truncates both
    data sets to 100 examples.  The data is split 90/10 into training and
    validation sets, the model is trained for 40 epochs, and after every
    epoch it is evaluated on the validation set.  Progress is logged to
    ``train.log``; models are saved as ``<cwd>/<model name>.<epoch>``.
    """
    out_dir = os.getcwd()
    parser = argparse.ArgumentParser(
        description='Get Training path and Glove path')
    parser.add_argument('--gpu', '-g', type=int, default=0,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--data', '-d', default=out_dir, help='data path')
    parser.add_argument('--glove', '-w', default=out_dir,
                        help='File to read glove vectors')
    parser.add_argument('-t', action='store_true', default=False,
                        dest='testing')
    parser.add_argument('--setvocab', '-v', default=None,
                        help="Providing already setup vocab")
    parser.add_argument('--setvectors', '-c', default=None)
    parser.add_argument('--model', '-m', default='NTIFullTreeMatching')
    args = parser.parse_args()

    gpu = args.gpu
    train_path = os.path.join(args.data, 'sts2016_train.stasis.csv')
    glove_path = os.path.join(args.glove, 'glove.840B.300d.txt')
    vocab_path = args.setvocab
    vectors_path = args.setvectors
    model_name = args.model

    logging.basicConfig(filename="train.log",
                        format="%(asctime)-15s %(message)s",
                        level=logging.DEBUG)
    logging.info("The output directory is %s", out_dir)
    logging.info("The glove path is %s", glove_path)
    logging.info("The data path is %s", train_path)
    if gpu < 0:
        logging.info("The program is running for CPU")
    else:
        logging.info("The program is running for GPU")

    n_epoch = 40  # number of epochs
    n_units = 300  # number of units per layer
    batch_size = 32  # minibatch size
    eval_batch = 64
    max_dev = 0
    max_tr = 0
    max_test = 0  # never updated below: no test set is evaluated
    max_epch = 0
    # Not used below; kept because it draws from NumPy's global random
    # generator and therefore influences the shuffles that follow.
    EMPTY = np.random.uniform(-0.1, 0.1, (1, 300)).astype(np.float32)

    # preprocessing
    data = preprocess.getData(train_path)
    # NOTE(review): pickle.load can execute arbitrary code; only pass
    # vocabulary/vector files that come from a trusted source.
    if vocab_path is not None:
        # Pickles are binary data: open in binary mode and close the handle.
        with open(vocab_path, "rb") as vocab_file:
            vocab = pickle.load(vocab_file)
        logging.info("Vocabulary loaded")
    else:
        vocab, _ = preprocess.create_vocab(data)
        with open(os.path.join(out_dir, "train_vocab.pkl"), "wb") as vocab_file:
            pickle.dump(vocab, vocab_file)
        logging.info("Dumping the training vocabulary in %s",
                     out_dir + "/train_vocab.pkl")
        logging.info("Vocabulary created")

    if vectors_path is None:
        vectors = preprocess.get_vectors(vocab, glove_path, out_dir)
        logging.info("Vectors formed")
    else:
        with open(vectors_path, "rb") as vectors_file:
            vectors = pickle.load(vectors_file)
        logging.info("Vectors loaded")

    train_data, validate_data = train_test_split(data, test_size=0.1)
    # .values replaces DataFrame.as_matrix(), which newer pandas removed.
    train_data = train_data.values
    validate_data = validate_data.values

    train_dataset, train_labels = preprocess.prepare_sentence_data(
        train_data, vectors)
    validation_dataset, validation_labels = preprocess.prepare_sentence_data(
        validate_data, vectors)

    if args.testing:
        logging.info("Just Testing!")
        train_dataset, train_labels = train_dataset[0:100], train_labels[0:100]
        validation_dataset = validation_dataset[0:100]
        validation_labels = validation_labels[0:100]

    logging.info("The training size is %d", len(train_labels))
    logging.info("The validation size is %d", len(validation_labels))

    if model_name == "BILSTM":
        model = BILSTM(n_units, gpu)
    else:
        model = NTIFullTreeMatching(n_units, gpu)
    model.init_optimizer()

    n_train = len(train_labels)
    n_dev = len(validation_labels)

    logging.debug("Training Begins")
    # training code
    for i in six.moves.range(0, n_epoch):
        logging.debug("epoch={}".format(i))

        # Shuffle the data
        shuffle = np.random.permutation(n_train)
        preds = []
        preds_true = []
        aLoss = 0
        ss = 0  # number of minibatches processed this epoch
        begin_time = time.time()
        for j in six.moves.range(0, n_train, batch_size):
            c_b = shuffle[j:min(j + batch_size, n_train)]
            ys = preprocess.batch(train_labels, c_b)
            preds_true.extend(ys)
            y_data = np.array(ys, dtype=np.int32)
            sent_batch = preprocess.batch(train_dataset, c_b)
            sent_batch = preprocess.stack_pairs(sent_batch)
            y_s, loss = model.train(sent_batch, y_data)
            aLoss = aLoss + loss.data
            preds.extend(y_s)
            ss = ss + 1
        logging.debug("loss:%f", aLoss / ss)
        logging.debug('secs per train epoch={}'.format(time.time() - begin_time))
        # The f1_* names hold accuracy_score values.
        f1_tr = accuracy_score(preds_true, preds)
        logging.debug('train accuracy_score={}'.format(f1_tr))
        logging.debug(confusion_matrix(preds_true, preds))

        # Validation pass, in order and without shuffling.
        preds = []
        preds_true = []
        for j in six.moves.range(0, n_dev, eval_batch):
            preds_true.extend(validation_labels[j:j + eval_batch])
            sent_batch = validation_dataset[j:j + eval_batch]
            sent_batch = preprocess.stack_pairs(sent_batch)
            preds.extend(model.predict(sent_batch))
        f1_dev = accuracy_score(preds_true, preds)
        logging.debug('dev accuracy_score={}'.format(f1_dev))
        logging.debug(confusion_matrix(preds_true, preds))

        if f1_dev > max_dev:
            max_dev = f1_dev
            max_tr = f1_tr
            max_epch = i
            logging.info('saving model')
            model.save(out_dir + '/' + model_name + '.' + str(i))
        # The scores are fractions in [0, 1]; %d would truncate them to 0.
        logging.info(
            "best results so far (dev): epoch=%d dev f1-score=%f test f1-score=%f",
            max_epch, max_dev, max_test)
# TODO: Have more sophisticated argument parser conf_file = sys.argv[1] # Create conf object. Can now access params, including db connection params = Conf(conf_file) # If topics weren't specified use all the topics # Each document should be one line (submission + comments) if not params.topics: topics = params.db.subreddit_list() else: topics = params.topics # Get the documents in Reddit format print "Retrieving documents..." reddit_documents = preprocess.getData(topics, params.comment_level, params.num_docs, params.db) print reddit_documents # Preprocess these print "Preprocessing documents..." preprocess.preprocess(reddit_documents, params.max_word_length, params.min_word_length, params.stopwords, params.stem) print reddit_documents ''' TODO: removal_threshold and removal_perc ''' # Split into train and test print "Splitting into train and test sets..." train,test = utils.partition(reddit_documents, .9) # Now save metadata to db to remember parameter configuration print "Saving metadata to mongodb..." metadata = utils.createMetaData(params) result = params.db.add_metadata(metadata)
def testClassification():
    """Report unlabelled reports whose SVM score lies close to the decision boundary.

    For every entry of ``REPORT_FILES_LABELLED`` a linear SVM is fitted on
    the labelled LSI vectors (read from ``./model_files/reports_lsi.mm``)
    and applied to the remaining, unlabelled vectors of the same report
    file.  Every unlabelled report whose absolute decision score is below
    0.001 is written, together with its corpus index, score and predicted
    label, to ``label_tests/<month_day_hour_minute>/labelClassification.csv``.
    """
    threshold = 0.001
    corpus = gensim.corpora.MmCorpus('./model_files/reports_lsi.mm')
    # convert the corpus to a numpy matrix, take the transpose and convert it to a list
    corpusList = [
        list(x) for x in zip(*gensim.matutils.corpus2dense(
            corpus, corpus.num_terms, dtype=np.float64))
    ]
    reports = preprocess.getReports()

    # Create the output directory.  The name only has minute resolution, so
    # a second run within the same minute must not fail on an existing one.
    directory = "label_tests/" + datetime.datetime.now().strftime(
        '%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)

    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])
        for j in range(len(REPORT_FILES_LABELLED)):
            # writerow("") emits an empty row that acts as a separator.
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])

            # The labeled data is at the start of the data set: the corpus
            # rows of this class start after all rows of the previous files,
            # labelled rows first, unlabelled rows after them.
            first = preprocess.getNumReports(REPORT_FILES[:j])
            labelled_end = first + preprocess.getNumReports(
                [REPORT_FILES_LABELLED[j]])
            class_end = first + preprocess.getNumReports([REPORT_FILES[j]])
            labelledCorpus = [corpusList[i]
                              for i in range(first, labelled_end)]
            unlabelledIds = list(range(labelled_end, class_end))
            unlabelledCorpus = [corpusList[i] for i in unlabelledIds]
            labels = np.asarray(
                preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]

            # THIS CODE BLOCK REMOVES THE NUMBER OF NEGATIVE LABELS TO
            # EQUALISE THE DISTRIBUTION OF CLASS LABELS. TO BE REMOVED IN
            # FUTURE.
            # Drop the earliest negatives until only twice as many examples
            # as positives remain.  When there is no surplus nothing is
            # dropped; otherwise every negative would be removed and the
            # classifier could not be fitted.
            surplus = len(labels) - 2 * list(labels).count("positive")
            deletes = []
            if surplus > 0:
                negatives = [x for x, label in enumerate(labels)
                             if label == "negative"]
                deletes = negatives[:surplus]
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)

            # build classifier
            classifier = svm.SVC(kernel='linear').fit(labelledCorpus, labels)

            # compute output label and corresponding score
            output_test = classifier.predict(unlabelledCorpus)
            output_scores_test = classifier.decision_function(unlabelledCorpus)

            # Sort by score, carrying each report's corpus index along.
            # Looking the index up again by vector equality is slow and
            # returns the wrong report when two reports share a vector.
            ranked = sorted(
                zip(output_scores_test, output_test, unlabelledIds))

            # save result to file
            for score, label, reportIdx in ranked:
                if abs(score) < threshold:
                    writer.writerow("")
                    writer.writerow([reportIdx, score, label])
                    writer.writerow([reports[reportIdx]])
def main():
    """Train and evaluate the age and gender estimators, then print confusion matrices.

    Data comes from ``preprocess.getData(img_size, num_total_data)``; the
    first 80% of ``num_total_data`` rows are used for training and the rest
    for evaluation.  Column 0 of a row is used as the feature, column 2 as
    the age label and column 3 as the gender label.  Both estimators are
    trained for ``num_steps`` steps with batches of 125.
    """
    total_data = preprocess.getData(img_size, num_total_data)
    split = int(num_total_data * 0.8)

    def features(rows):
        return np.array([row[0] for row in rows]).astype(np.float32)

    def labels(rows, column):
        # Builtin int is what the removed alias np.int referred to.
        return np.array([row[column] for row in rows]).astype(int).flatten()

    # first 80% rows
    train_data = total_data[:split]
    train_feature = features(train_data)
    train_age_label = labels(train_data, 2)  # age
    train_gender_label = labels(train_data, 3)  # gender

    # the last 20% rows
    eval_data = total_data[split:]
    eval_feature = features(eval_data)
    eval_age_label = labels(eval_data, 2)  # age
    eval_gender_label = labels(eval_data, 3)  # gender

    def train_input_fn(y):
        # Shuffled batches, repeated indefinitely; train() stops on `steps`.
        return tf.estimator.inputs.numpy_input_fn(
            x={"x": train_feature}, y=y, batch_size=125, num_epochs=None,
            shuffle=True)

    def eval_input_fn(y=None):
        # One ordered pass over the evaluation rows; y is omitted for predict().
        return tf.estimator.inputs.numpy_input_fn(
            x={"x": eval_feature}, y=y, num_epochs=1, shuffle=False)

    # build the classifier
    age_classifier = tf.estimator.Estimator(neural_network_age_model_fn,
                                            model_dir=age_model_dir)
    gender_classifier = tf.estimator.Estimator(neural_network_gender_model_fn,
                                               model_dir=gender_model_dir)

    # train the model
    age_classifier.train(input_fn=train_input_fn(train_age_label),
                         steps=num_steps)
    gender_classifier.train(input_fn=train_input_fn(train_gender_label),
                            steps=num_steps)

    # evaluate the model
    age_eval_results = age_classifier.evaluate(
        input_fn=eval_input_fn(eval_age_label))
    print("age evaluation results:" + str(age_eval_results))
    gender_eval_results = gender_classifier.evaluate(
        input_fn=eval_input_fn(eval_gender_label))
    print("gender evaluation results:" + str(gender_eval_results))

    # print confusion matrix for age
    age_predictions = list(age_classifier.predict(eval_input_fn()))
    predicted_ages = [p["classes"] for p in age_predictions]
    age_confusion_matrix = tf.confusion_matrix(labels=eval_age_label,
                                               predictions=predicted_ages,
                                               num_classes=age_num_classes)
    with tf.Session() as sess:
        print('Age Confusion Matrix: \n\n', sess.run(age_confusion_matrix))

    # print confusion matrix for gender
    gender_predictions = list(gender_classifier.predict(eval_input_fn()))
    predicted_genders = [p["classes"] for p in gender_predictions]
    gender_confusion_matrix = tf.confusion_matrix(
        labels=eval_gender_label, predictions=predicted_genders,
        num_classes=gender_num_classes)
    with tf.Session() as sess:
        print('Gender Confusion Matrix: \n\n',
              sess.run(gender_confusion_matrix))
from preprocess import getData from preprocess import getGT from keras import backend as K from keras.models import load_model ## Variable MSE = [] total_epochs = 300 input_dim = 4 window_size = 60 predict_days = 20 data_frequency = 1 ## Get data real_stock_price = getGT(predict_days, data_frequency) [x_test, y_test], scaler_list = getData(input_dim, window_size, predict_days, data_frequency, "test") ## Model predict # load model # path = f"./model/epoch_{total_epochs},dim_{input_dim},win_{window_size}/" path = f"./model/draw/" lstm = load_model(f'{path}000.h5') label1 = 'test1' # rnn = load_model(f'{path}50unit.h5') # label2 = 'test2' lstm_output = lstm.predict(x_test) # rnn_output = rnn.predict(x_test) # get all the close price lstm_close_price = [] for j in range(len(lstm_output)):
def testClassification(threshold,fileType):
    """Report unlabelled reports whose SVM score is within ``threshold`` of the boundary.

    A linear SVM is fitted on the labelled LSI vectors of
    ``Cleaned<fileType>Labelled.csv`` and applied to the remaining vectors
    of ``Cleaned<fileType>Full.csv`` (vectors are read from
    ``../model_files/reports_lsi.mm``).

    Args:
        threshold: reports with ``abs(decision score) < threshold`` are
            written to ``labelClassification.csv``.
        fileType: name inserted into the report file names; also passed to
            ``preprocess.getReports`` and written as the section heading.

    Side effects:
        Writes ``labelClassification.csv`` and ``coef.csv`` in the current
        directory, prints the model coefficients, and appends them to the
        ``parameters`` list, which is not defined in this function and is
        presumably a module-level list -- TODO confirm.
    """
    REPORT_FILES = [('Cleaned' + fileType + 'Full.csv')]
    REPORT_FILES_LABELLED = [('Cleaned' + fileType + 'Labelled.csv')]
    DIAGNOSES = [fileType]

    corpus = gensim.corpora.MmCorpus('../model_files/reports_lsi.mm')
    # convert the corpus to a numpy matrix, take the transpose and convert it to a list
    corpusList = [list(x) for x in zip(*gensim.matutils.corpus2dense(
        corpus, corpus.num_terms, dtype=np.float64))]
    reports = preprocess.getReports(fileType)

    with open("labelClassification.csv",'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score","output label","expected label","report"])
        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow([DIAGNOSES[j],"",""])  # Added "" for csv parsing

            # The labeled data is at the start of the data set: the corpus
            # rows of this class start after all rows of the previous files,
            # labelled rows first, unlabelled rows after them.
            first = preprocess.getNumReports(REPORT_FILES[:j])
            labelled_end = first + preprocess.getNumReports(
                [REPORT_FILES_LABELLED[j]])
            class_end = first + preprocess.getNumReports([REPORT_FILES[j]])
            labelledCorpus = [corpusList[i]
                              for i in range(first, labelled_end)]
            unlabelledIds = list(range(labelled_end, class_end))
            unlabelledCorpus = [corpusList[i] for i in unlabelledIds]
            labels = np.asarray(
                preprocess.getData([REPORT_FILES_LABELLED[j]]))[:,2]

            # THIS CODE BLOCK REMOVES THE NUMBER OF NEGATIVE LABELS TO
            # EQUALISE THE DISTRIBUTION OF CLASS LABELS. TO BE REMOVED IN
            # FUTURE.
            # Drop the earliest negatives until only twice as many examples
            # as positives remain.  When there is no surplus nothing is
            # dropped; otherwise every negative would be removed and the
            # classifier could not be fitted.
            surplus = len(labels) - 2 * list(labels).count("positive")
            deletes = []
            if surplus > 0:
                negatives = [x for x, label in enumerate(labels)
                             if label == "negative"]
                deletes = negatives[:surplus]
            labelledCorpus = np.delete(labelledCorpus,deletes,axis=0)
            labels = np.delete(labels,deletes)

            # build classifier
            classifier = svm.SVC(kernel='linear').fit(labelledCorpus,labels)
            print("")
            print("Model parameters:")
            print(classifier.coef_)
            print("")
            print("L2 norm of current model: " + str(np.linalg.norm(classifier.coef_)))
            print("")
            for coefficients in classifier.coef_:
                parameters.append(coefficients)

            # compute output label and corresponding score
            output_test = classifier.predict(unlabelledCorpus)
            output_scores_test = classifier.decision_function(unlabelledCorpus)

            # Sort by score, carrying each report's corpus index along.
            # Looking the index up again by vector equality is slow and
            # returns the wrong report when two reports share a vector.
            ranked = sorted(
                zip(output_scores_test, output_test, unlabelledIds))

            # save result to file
            for score, label, reportIdx in ranked:
                if abs(score) < threshold:
                    writer.writerow([reportIdx, score, label])
                    # Added extra "" to make csv parsing work
                    writer.writerow([reports[reportIdx],"",""])

    # Write model parameters to file
    with open("coef.csv",'w') as fout:
        writer = csv.writer(fout)
        for coefficients in parameters:
            writer.writerow(coefficients)
    print("Model parameters saved to file.")
def labelClassificationD2V():
    """Evaluate a linear SVM on Doc2Vec vectors of the labelled reports.

    For every entry of ``REPORT_FILES_LABELLED`` the labelled reports are
    embedded with the model stored in
    ``./model_files/reports.doc2vec_model`` and a linear SVM is trained and
    tested on five independent random splits (13% test).  For each split
    the train and test ROC curves are drawn on a per-diagnosis figure and
    the test scores, predicted labels, expected labels and report texts are
    appended to ``labelClassification.csv``.  Figure and CSV are stored in
    ``label_classification/<month_day_hour_minute>/``.
    """
    model = gensim.models.Doc2Vec.load("./model_files/reports.doc2vec_model")
    reports = preprocess.getReports()
    processedReports = preprocess.getProcessedReports()
    numFolds = 5  # number of random train/test splits

    directory = "label_classification/" + datetime.datetime.now().strftime(
        '%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)

    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])
        for j in range(len(REPORT_FILES_LABELLED)):
            # writerow("") emits an empty row that acts as a separator.
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])

            # initialise figure and plot
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC")

            # fetch corpus and labels
            # The labeled data is at the start of the data set: the rows of
            # this class start after all rows of the previous report files.
            first = preprocess.getNumReports(REPORT_FILES[:j])
            last = first + preprocess.getNumReports(
                [REPORT_FILES_LABELLED[j]])
            labelledReports = []
            labelledCorpus = []
            for i in range(first, last):
                labelledReports.append(reports[i])
                labelledCorpus.append(model.infer_vector(processedReports[i]))
            labels = np.asarray(
                preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]
            # Position of every vector in labelledReports.  It travels with
            # the vector through balancing and splitting so the report text
            # can be found again without comparing vectors.
            reportIds = np.arange(len(labelledCorpus))

            # THIS CODE BLOCK REMOVES THE NUMBER OF NEGATIVE LABELS TO
            # EQUALISE THE DISTRIBUTION OF CLASS LABELS. TO BE REMOVED IN
            # FUTURE.
            # Drop the earliest negatives until only twice as many examples
            # as positives remain.  When there is no surplus nothing is
            # dropped; otherwise every negative would be removed and the
            # classifier could not be fitted.
            surplus = len(labels) - 2 * list(labels).count("positive")
            deletes = []
            if surplus > 0:
                negatives = [x for x, label in enumerate(labels)
                             if label == "negative"]
                deletes = negatives[:surplus]
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)
            reportIds = np.delete(reportIds, deletes)

            for n in range(numFolds):
                # split training and test data
                (train_labelledCorpus, test_labelledCorpus,
                 train_labels, test_labels,
                 _, test_ids) = train_test_split(
                     labelledCorpus, labels, reportIds, test_size=0.13)

                # build classifier
                classifier = svm.SVC(kernel='linear').fit(
                    train_labelledCorpus, train_labels)

                # compute output label and corresponding score
                output_test = classifier.predict(test_labelledCorpus)
                output_scores_test = classifier.decision_function(
                    test_labelledCorpus)
                output_scores_train = classifier.decision_function(
                    train_labelledCorpus)

                # Sort scores and labels in order.  The tuples hold the
                # report position rather than the vector: comparing numpy
                # vectors on a tie raises an error.
                ranked = sorted(zip(output_scores_test, output_test,
                                    test_labels, test_ids))
                output_scores_test, output_test, test_labels, test_ids = zip(
                    *ranked)

                # build roc curve and plot
                fp_test, tp_test, _ = roc_curve(test_labels,
                                                output_scores_test,
                                                pos_label="positive")
                fp_train, tp_train, _ = roc_curve(train_labels,
                                                  output_scores_train,
                                                  pos_label="positive")
                # Only the first split contributes legend entries.
                plt.plot(fp_test, tp_test, 'r',
                         label="test" if n == 0 else "")
                plt.plot(fp_train, tp_train, 'b',
                         label="train" if n == 0 else "")
                plt.legend(loc='lower right')
                plt.savefig(directory + name)

                # save result to file
                for score, predicted, expected, reportIdx in ranked:
                    writer.writerow("")
                    writer.writerow([score, predicted, expected])
                    writer.writerow([labelledReports[reportIdx]])
            # plt.show()
def labelClassificationRNN(learn=True):
    """Select the SVM ``C`` value per diagnosis on RNN report vectors and plot ROC curves.

    Args:
        learn: when true, a fixed grid of ``C`` values is searched and the
            best value per diagnosis (highest cross-validation AUC) is
            pickled to ``./model_files/svm_c_values.pkl``; when false, the
            values stored in that file are used instead.

    For every diagnosis and every candidate ``C`` a linear SVM is trained on
    five random splits (15% test; the last 20% of the remaining training
    rows serve as cross-validation set).  Scores of all five splits are
    pooled to compute test / cv / train ROC curves and AUCs.  The curves of
    the best candidate are saved as a figure, and the per-split test results
    are written to ``labelClassification.csv``, both inside
    ``label_classification/<month_day_hour_minute>/``.
    """
    if learn:
        # Candidate C values; each row holds one value per diagnosis column.
        c_vals = [[c] * 4 for c in (0.005, 0.01, 0.05, 0.1, 0.5, 1)]
        optimal_c = [[0,0,0,0]]
    else:
        # Pickles are binary data: open in binary mode and close the handle.
        with open('./model_files/svm_c_values.pkl', 'rb') as c_file:
            c_vals = pickle.load(c_file)
        optimal_c = c_vals

    reports = preprocess.getReports()
    reportVectors = rnn.loadReportVecs()
    numFolds = 5  # number of random train/test splits per C value

    directory = "label_classification/" + datetime.datetime.now().strftime('%m_%d_%H_%M') +"/"
    if not os.path.exists(directory):
        os.makedirs(directory)

    with open(directory+"labelClassification.csv",'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score","output label","expected label","report"])
        for j in range(len(REPORT_FILES_LABELLED)):
            # writerow("") emits an empty row that acts as a separator.
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])

            # fetch corpus and labels
            # The labeled data is at the start of the data set: the rows of
            # this class start after all rows of the previous report files.
            first = preprocess.getNumReports(REPORT_FILES[:j])
            last = first + preprocess.getNumReports([REPORT_FILES_LABELLED[j]])
            labelledReports = []
            labelledCorpus = []
            for i in range(first, last):
                labelledReports.append(reports[i])
                labelledCorpus.append(reportVectors[i][:])
            labels = np.asarray(preprocess.getData([REPORT_FILES_LABELLED[j]]))[:,2]
            # Position of every vector in labelledReports.  It travels with
            # the vector through the split so the report text can be found
            # again without comparing vectors.
            reportIds = np.arange(len(labelledCorpus))

            # NOTE: the removal of surplus negative labels that other
            # classification routines perform is intentionally disabled here.

            best_area_cv = -1
            for c_value in c_vals:
                # Labels and scores pooled over all splits for this C value.
                all_test_labels, all_output_scores_test = [], []
                all_cv_labels, all_output_scores_cv = [], []
                all_train_labels, all_output_scores_train = [], []

                for n in range(numFolds):
                    # split training and test data
                    (train_labelledCorpus, test_labelledCorpus,
                     train_labels, test_labels,
                     _, test_ids) = train_test_split(
                         labelledCorpus, labels, reportIds, test_size=0.15)

                    # Split of the last 20% of training set for cross validation
                    cut = int(0.8 * len(train_labelledCorpus))
                    cv_labelledCorpus = train_labelledCorpus[cut:]
                    train_labelledCorpus = train_labelledCorpus[:cut]
                    cv_labels = train_labels[cut:]
                    train_labels = train_labels[:cut]

                    # build classifier
                    classifier = svm.SVC(C=c_value[j],kernel='linear').fit(
                        train_labelledCorpus, train_labels)

                    # compute output label and corresponding score
                    output_test = classifier.predict(test_labelledCorpus)
                    output_scores_test = classifier.decision_function(test_labelledCorpus)
                    output_scores_train = classifier.decision_function(train_labelledCorpus)
                    output_scores_cv = classifier.decision_function(cv_labelledCorpus)

                    all_test_labels.extend(test_labels)
                    all_output_scores_test.extend(output_scores_test)
                    all_cv_labels.extend(cv_labels)
                    all_output_scores_cv.extend(output_scores_cv)
                    all_train_labels.extend(train_labels)
                    all_output_scores_train.extend(output_scores_train)

                    # save result for fold to file
                    for score, predicted, expected, reportIdx in zip(
                            output_scores_test, output_test, test_labels,
                            test_ids):
                        # The text must be wrapped in a list: writerow on a
                        # bare string writes one character per column.
                        writer.writerow(["With c value: "+str(c_value[j])])
                        writer.writerow([score, predicted, expected])
                        writer.writerow([labelledReports[reportIdx]])

                # generate the roc curve
                fp_test,tp_test,_ = roc_curve(all_test_labels,all_output_scores_test,pos_label="positive")
                fp_cv,tp_cv,_ = roc_curve(all_cv_labels,all_output_scores_cv,pos_label="positive")
                fp_train,tp_train,_ = roc_curve(all_train_labels,all_output_scores_train,pos_label="positive")

                # Calculate the area under the curves
                area_test = auc(fp_test, tp_test)
                area_cv = auc(fp_cv, tp_cv)
                area_train = auc(fp_train, tp_train)

                # Store c value,tps, fps and aucs if cv auc is new best
                if area_cv > best_area_cv:
                    optimal_c[0][j] = c_value[j]
                    best_fp_test, best_tp_test = fp_test, tp_test
                    best_fp_cv, best_tp_cv = fp_cv, tp_cv
                    best_fp_train, best_tp_train = fp_train, tp_train
                    best_area_test = area_test
                    best_area_cv = area_cv
                    best_area_train = area_train

            # initialise and plot the average ROC curves for optimal c value
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC: c value of "+str(optimal_c[0][j]))
            plt.plot(best_fp_test,best_tp_test,'b',label='test(area = %0.2f)' % best_area_test)
            plt.plot(best_fp_cv,best_tp_cv,'g',label='cv(area = %0.2f)' % best_area_cv)
            plt.plot(best_fp_train,best_tp_train,'r',label='train(area = %0.2f)' % best_area_train)
            plt.legend(loc='lower right')
            plt.savefig(directory+name)

    if learn:
        with open('./model_files/svm_c_values.pkl', 'wb') as c_file:
            pickle.dump(optimal_c, c_file)
# reduction = 'lsa' # method = 'l2' # or frobenius # dimensions = 500 # # DISTANCE CLASSIFICATION FUNCTION # distance = 'cosine' # # NUMBER OF NEIGHBORS # neighbors = 1 ''' Data ''' # Get a list of RedditDocument objects rdb = db.RedditDB("blacksun.cs.mcgill.ca", 31050, 'ejacques', 'shellcentershell', "reddit_topics") documents = P.getData(topics, comment_level, num_docs, rdb) print "done getting documents:", len(documents) ''' Preprocessing ''' # Apply some basic preprocessing functions to it. # Default is not to stem. P.preprocess(documents, max_word_length=max_word_length, min_word_length=min_word_length, stopwords=stopwords) # Add websites to documents P.addWebsites(documents) print "done preprocessing" # Divide into training and test set
def labelClassificationD2V():
    """Evaluate a linear SVM on Doc2Vec report vectors, one diagnosis at a time.

    Loads the Doc2Vec model from ``./model_files/reports.doc2vec_model`` and,
    for every entry of ``REPORT_FILES_LABELLED``:

    * infers a vector for each labelled report and reads the labels from
      column 2 of ``preprocess.getData``;
    * removes the first ``len(labels) - 2 * #positive`` "negative" examples
      (none when that number is not positive);
    * runs ``numFolds`` random train/test splits (13% test) with a linear
      ``svm.SVC``, plots the per-split ROC curves for train and test data and
      appends every test prediction to ``labelClassification.csv``.

    The CSV and one ROC figure per diagnosis are written to a
    ``label_classification/<MM_DD_HH_MM>/`` directory, created if missing.
    """
    model = gensim.models.Doc2Vec.load("./model_files/reports.doc2vec_model")
    reports = preprocess.getReports()
    processedReports = preprocess.getProcessedReports()
    numFolds = 5  # number of random train/test splits per diagnosis

    directory = "label_classification/" + datetime.datetime.now().strftime('%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)

    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])

        for j in range(len(REPORT_FILES_LABELLED)):
            # two blank rows, then the diagnosis name as a section header
            writer.writerow([])
            writer.writerow([])
            writer.writerow([DIAGNOSES[j]])

            # initialise figure and plot
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC")

            # The labelled data is at the start of each report file's range:
            # collect the raw reports and their inferred vectors.
            firstIdx = preprocess.getNumReports(REPORT_FILES[:j])
            lastIdx = firstIdx + preprocess.getNumReports([REPORT_FILES_LABELLED[j]])
            labelledReports = []
            labelledCorpus = []
            for i in range(firstIdx, lastIdx):
                labelledReports.append(reports[i])
                labelledCorpus.append(model.infer_vector(processedReports[i]))
            labels = np.asarray(preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]
            # Snapshot taken BEFORE balancing so that positions in corpusList
            # line up with labelledReports.
            corpusList = [list(x) for x in labelledCorpus]

            # Equalise the class distribution by dropping the earliest surplus
            # negatives.  The surplus is clamped at zero: the previous loop
            # never hit its stop condition when negatives <= positives and
            # therefore deleted every negative example.
            labelList = list(labels)
            surplus = len(labelList) - 2 * labelList.count("positive")
            negatives = [idx for idx, label in enumerate(labelList) if label == "negative"]
            deletes = negatives[:max(surplus, 0)]
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)

            for n in range(numFolds):
                # split training and test data
                train_labelledCorpus, test_labelledCorpus, train_labels, test_labels = train_test_split(
                    labelledCorpus, labels, test_size=0.13)

                # build classifier
                classifier = svm.SVC(kernel='linear').fit(train_labelledCorpus, train_labels)

                # compute output label and corresponding score
                output_test = classifier.predict(test_labelledCorpus)
                output_scores_test = classifier.decision_function(test_labelledCorpus)
                output_scores_train = classifier.decision_function(train_labelledCorpus)

                # Sort by (score, predicted label, expected label).  The vector
                # is kept out of the key: comparing two numpy arrays on a tie
                # raises "truth value of an array is ambiguous".
                rows = sorted(
                    zip(output_scores_test, output_test, test_labels, test_labelledCorpus),
                    key=lambda row: row[:3])
                output_scores_test, output_test, test_labels, test_labelledCorpus = zip(*rows)

                # build roc curve and plot
                fp_test, tp_test, _ = roc_curve(test_labels, output_scores_test, pos_label="positive")
                fp_train, tp_train, _ = roc_curve(train_labels, output_scores_train, pos_label="positive")
                # Red = train, blue = test.  Previously the test curve was
                # drawn under the "train" label and vice versa.
                plt.plot(fp_train, tp_train, 'r', label="train" if n == 0 else "")
                plt.plot(fp_test, tp_test, 'b', label="test" if n == 0 else "")
                plt.legend(loc='lower right')
                plt.savefig(directory + name)

                # save result to file
                for r in range(len(test_labels)):
                    reportIdx = corpusList.index(list(test_labelledCorpus[r]))
                    writer.writerow([])
                    writer.writerow([output_scores_test[r], output_test[r], test_labels[r]])
                    writer.writerow([labelledReports[reportIdx]])
            # plt.show()
def labelClassification():
    """Evaluate a linear SVM on the LSI report vectors, one diagnosis at a time.

    Loads the LSI corpus from ``./model_files/reports_lsi.mm`` and, for every
    entry of ``REPORT_FILES_LABELLED``:

    * takes the vectors of the labelled reports and reads their labels from
      column 2 of ``preprocess.getData``;
    * removes the first ``len(labels) - 2 * #positive`` "negative" examples
      (none when that number is not positive);
    * runs ``numFolds`` random train/test splits (13% test) with a linear
      ``svm.SVC`` and appends every test prediction to
      ``labelClassification.csv``;
    * plots one test and one train ROC curve computed from the scores pooled
      over all splits, with the AUC in the legend.

    The CSV and one ROC figure per diagnosis are written to a
    ``label_classification/<MM_DD_HH_MM>/`` directory, created if missing.
    """
    corpus = gensim.corpora.MmCorpus('./model_files/reports_lsi.mm')
    # convert the corpus to a numpy matrix, take the transpose and convert it to a list
    corpusList = [list(x) for x in zip(*gensim.matutils.corpus2dense(
        corpus, corpus.num_terms, dtype=np.float64))]
    reports = preprocess.getReports()
    numFolds = 5  # number of random train/test splits per diagnosis

    # Create the output directory
    directory = "label_classification/" + datetime.datetime.now().strftime('%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)

    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])

        for j in range(len(REPORT_FILES_LABELLED)):
            # two blank rows, then the diagnosis name as a section header
            writer.writerow([])
            writer.writerow([])
            writer.writerow([DIAGNOSES[j]])

            # initialise figure and plot
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC")

            # The labelled data is at the start of each report file's range.
            firstIdx = preprocess.getNumReports(REPORT_FILES[:j])
            lastIdx = firstIdx + preprocess.getNumReports([REPORT_FILES_LABELLED[j]])
            labelledCorpus = [corpusList[i] for i in range(firstIdx, lastIdx)]
            labels = np.asarray(preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]

            # Equalise the class distribution by dropping the earliest surplus
            # negatives.  The surplus is clamped at zero: the previous loop
            # never hit its stop condition when negatives <= positives and
            # therefore deleted every negative example.
            labelList = list(labels)
            surplus = len(labelList) - 2 * labelList.count("positive")
            negatives = [idx for idx, label in enumerate(labelList) if label == "negative"]
            deletes = negatives[:max(surplus, 0)]
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)

            # labels and scores pooled over all splits
            all_test_labels = ()
            all_output_scores_test = ()
            all_train_labels = ()
            all_output_scores_train = ()

            for n in range(numFolds):
                # split training and test data
                train_labelledCorpus, test_labelledCorpus, train_labels, test_labels = train_test_split(
                    labelledCorpus, labels, test_size=0.13)

                # build classifier
                classifier = svm.SVC(kernel='linear').fit(train_labelledCorpus, train_labels)

                # compute output label and corresponding score
                output_test = classifier.predict(test_labelledCorpus)
                output_scores_test = classifier.decision_function(test_labelledCorpus)
                output_scores_train = classifier.decision_function(train_labelledCorpus)

                # Sort by (score, predicted label, expected label).  The vector
                # is kept out of the key: comparing two numpy arrays on a tie
                # raises "truth value of an array is ambiguous".
                rows = sorted(
                    zip(output_scores_test, output_test, test_labels, test_labelledCorpus),
                    key=lambda row: row[:3])
                output_scores_test, output_test, test_labels, test_labelledCorpus = zip(*rows)

                all_test_labels += test_labels
                all_output_scores_test += output_scores_test
                all_train_labels += tuple(train_labels)
                all_output_scores_train += tuple(output_scores_train)

                # save result to file
                for r in range(len(test_labels)):
                    # index into the full corpus, which is aligned with `reports`
                    reportIdx = corpusList.index(list(test_labelledCorpus[r]))
                    writer.writerow([])
                    writer.writerow([output_scores_test[r], output_test[r], test_labels[r]])
                    writer.writerow([reports[reportIdx]])

            # generate the roc curve
            fp_test, tp_test, _ = roc_curve(all_test_labels, all_output_scores_test, pos_label="positive")
            fp_train, tp_train, _ = roc_curve(all_train_labels, all_output_scores_train, pos_label="positive")

            # Calculate the area under the curves
            area_test = auc(fp_test, tp_test)
            area_train = auc(fp_train, tp_train)

            # Plot the average ROC curves
            plt.plot(fp_test, tp_test, 'b', label='test(area = %0.2f)' % area_test)
            plt.plot(fp_train, tp_train, 'r', label='train(area = %0.2f)' % area_train)
            plt.legend(loc='lower right')
            plt.savefig(directory + name)
import pickle

from preprocess import getData

# Dump the un-normalised data returned by getData() for each interval to
# "<interval>_unnormalized.p" in the current working directory.
#intervals = [15,30,60]
intervals = [60]
for i in intervals:
    data, data_unnormalized = getData(i)
    # pickle.dump(data,open(str(i)+".p","wb"))
    # Use a context manager so the file is flushed and closed even when
    # pickling fails; the handle used to be left open.
    with open(str(i) + "_unnormalized.p", "wb") as outFile:
        pickle.dump(data_unnormalized, outFile)
topics = params.topics # Now save metadata to db to remember parameter configuration print "Saving metadata to mongodb..." metadata = utils.createMetaData(params) result = params.db.add_metadata(metadata) if result: print "Save successful." else: print "Save not successful." # Get the documents in Reddit format print "Retrieving documents..." trainpath = "../data/scala/llda_train_"+str(result)+".csv" testpath = "../data/scala/llda_test_"+str(result)+".csv" preprocess.getData(topics, params.comment_level, params.num_docs, params.db, trainpath, testpath, params.max_word_length, params.min_word_length, params.stopwords, params.stem) # Preprocess these #print "Preprocessing documents..." #preprocess.preprocess(reddit_documents, params.max_word_length, params.min_word_length, params.stopwords, params.stem) #''' TODO: removal_threshold and removal_perc ''' # Split into train and test #print "Splitting into train and test sets..." #train,test = utils.partition(reddit_documents, .9) # Print each document to file # Add metadata's db id to filename to be able to match up to metadata in db #timestamp = '_'.join(str(datetime.today()).split()) #ftrain = open(trainpath, "wt") #ftest = open(testpath, "wt")
# Will keep track of total counts word_frequencies = {} # Each subreddit has associated list of documents documents = {} # Each subreddit has submission- and comment-level date ranges dates = {} start = time.clock() # Get documents from mongodb while preprocessing and # counting overall word frequencies for topic in topics: st = time.clock() print topic s = time.clock() docs = P.getData([topic], comment_level, num_docs) # Find date range for this topic (at submission level) subsd, subed = utils.submission_date_range(docs) # Find date range for this topic (at comment level) commsd, commed = utils.comment_date_range(docs) dates[topic] = {"SSD":subsd, "SED": subed, "CSD": commsd, "CED":commed} print "\t",len(docs), "documents:", time.clock()-s s = time.clock() P.preprocess(docs, max_word_length=max_word_length, min_word_length=min_word_length, stopwords='long', stem=False) print "\tdone preprocessing:", time.clock()-s documents[topic] = docs print "\tcalculating total frequencies..." s = time.clock() for doc in docs:
def explore_data():
    """Load the data set via ``getData`` and print the shape of its first element."""
    features, _ = getData()
    print(features.shape)
def testClassification():
    """Find unlabelled reports that lie close to the SVM decision boundary.

    Loads the LSI corpus from ``./model_files/reports_lsi.mm`` and, for every
    entry of ``REPORT_FILES_LABELLED``:

    * trains a linear ``svm.SVC`` on the labelled reports, after removing the
      first ``len(labels) - 2 * #positive`` "negative" examples (none when
      that number is not positive);
    * scores the remaining, unlabelled reports of the same report file;
    * writes every unlabelled report whose absolute decision score is below
      ``threshold`` to ``labelClassification.csv`` as a row
      ``[report index, score, predicted label]`` followed by the report.

    Output goes to a ``label_tests/<MM_DD_HH_MM>/`` directory, created if
    missing.
    """
    threshold = 0.001  # |decision score| below this counts as "uncertain"
    corpus = gensim.corpora.MmCorpus('./model_files/reports_lsi.mm')
    # convert the corpus to a numpy matrix, take the transpose and convert it to a list
    corpusList = [list(x) for x in zip(*gensim.matutils.corpus2dense(
        corpus, corpus.num_terms, dtype=np.float64))]
    reports = preprocess.getReports()

    # Create the output directory.  Guarded like the sibling functions: an
    # unconditional makedirs() fails when run twice within the same minute.
    directory = "label_tests/" + datetime.datetime.now().strftime('%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)

    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        # NOTE(review): this header does not match the data rows written
        # below ([report index, score, output label]); left unchanged in case
        # existing consumers depend on it.
        writer.writerow(["score", "output label", "expected label", "report"])

        for j in range(len(REPORT_FILES_LABELLED)):
            # two blank rows, then the diagnosis name as a section header
            writer.writerow([])
            writer.writerow([])
            writer.writerow([DIAGNOSES[j]])

            # The labelled data is at the start of each report file's range;
            # the rest of that range is unlabelled.
            firstIdx = preprocess.getNumReports(REPORT_FILES[:j])
            labelledEnd = firstIdx + preprocess.getNumReports([REPORT_FILES_LABELLED[j]])
            fileEnd = firstIdx + preprocess.getNumReports([REPORT_FILES[j]])
            labelledCorpus = [corpusList[i] for i in range(firstIdx, labelledEnd)]
            unlabelledCorpus = [corpusList[i] for i in range(labelledEnd, fileEnd)]
            labels = np.asarray(preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]

            # Equalise the class distribution by dropping the earliest surplus
            # negatives.  The surplus is clamped at zero: the previous loop
            # never hit its stop condition when negatives <= positives and
            # therefore deleted every negative example.
            labelList = list(labels)
            surplus = len(labelList) - 2 * labelList.count("positive")
            negatives = [idx for idx, label in enumerate(labelList) if label == "negative"]
            deletes = negatives[:max(surplus, 0)]
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)

            # Nothing to classify for this diagnosis: skip instead of failing
            # on an empty prediction set.
            if not unlabelledCorpus:
                continue

            # build classifier
            classifier = svm.SVC(kernel='linear').fit(labelledCorpus, labels)

            # compute output label and corresponding score
            output_test = classifier.predict(unlabelledCorpus)
            output_scores_test = classifier.decision_function(unlabelledCorpus)

            # sort scores and labels in order
            rows = sorted(zip(output_scores_test, output_test, unlabelledCorpus))
            output_scores_test, output_test, unlabelledCorpus = zip(*rows)

            # save result to file
            for r in range(len(unlabelledCorpus)):
                if abs(output_scores_test[r]) < threshold:
                    # index into the full corpus, which is aligned with `reports`
                    reportIdx = corpusList.index(list(unlabelledCorpus[r]))
                    writer.writerow([])
                    writer.writerow([reportIdx, output_scores_test[r], output_test[r]])
                    writer.writerow([reports[reportIdx]])
def labelClassification():
    """Evaluate a linear SVM on the LSI report vectors, one diagnosis at a time.

    Loads the LSI corpus from ``./model_files/reports_lsi.mm`` and, for every
    entry of ``REPORT_FILES_LABELLED``:

    * takes the vectors of the labelled reports and reads their labels from
      column 2 of ``preprocess.getData``;
    * removes the first ``len(labels) - 2 * #positive`` "negative" examples
      (none when that number is not positive);
    * runs ``numFolds`` random train/test splits (13% test) with a linear
      ``svm.SVC`` and appends every test prediction to
      ``labelClassification.csv``;
    * plots one test and one train ROC curve computed from the scores pooled
      over all splits, with the AUC in the legend.

    The CSV and one ROC figure per diagnosis are written to a
    ``label_classification/<MM_DD_HH_MM>/`` directory, created if missing.
    """
    corpus = gensim.corpora.MmCorpus('./model_files/reports_lsi.mm')
    # convert the corpus to a numpy matrix, take the transpose and convert it to a list
    corpusList = [
        list(x) for x in zip(*gensim.matutils.corpus2dense(
            corpus, corpus.num_terms, dtype=np.float64))
    ]
    reports = preprocess.getReports()
    numFolds = 5  # number of random train/test splits per diagnosis

    # Create the output directory
    directory = "label_classification/" + datetime.datetime.now().strftime(
        '%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)

    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])

        for j in range(len(REPORT_FILES_LABELLED)):
            # two blank rows, then the diagnosis name as a section header
            writer.writerow([])
            writer.writerow([])
            writer.writerow([DIAGNOSES[j]])

            # initialise figure and plot
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC")

            # The labelled data is at the start of each report file's range.
            firstIdx = preprocess.getNumReports(REPORT_FILES[:j])
            lastIdx = firstIdx + preprocess.getNumReports(
                [REPORT_FILES_LABELLED[j]])
            labelledCorpus = [corpusList[i] for i in range(firstIdx, lastIdx)]
            labels = np.asarray(
                preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]

            # Equalise the class distribution by dropping the earliest surplus
            # negatives.  The surplus is clamped at zero: the previous loop
            # never hit its stop condition when negatives <= positives and
            # therefore deleted every negative example.
            labelList = list(labels)
            surplus = len(labelList) - 2 * labelList.count("positive")
            negatives = [
                idx for idx, label in enumerate(labelList)
                if label == "negative"
            ]
            deletes = negatives[:max(surplus, 0)]
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)

            # labels and scores pooled over all splits
            all_test_labels = ()
            all_output_scores_test = ()
            all_train_labels = ()
            all_output_scores_train = ()

            for n in range(numFolds):
                # split training and test data
                train_labelledCorpus, test_labelledCorpus, train_labels, test_labels = train_test_split(
                    labelledCorpus, labels, test_size=0.13)

                # build classifier
                classifier = svm.SVC(kernel='linear').fit(
                    train_labelledCorpus, train_labels)

                # compute output label and corresponding score
                output_test = classifier.predict(test_labelledCorpus)
                output_scores_test = classifier.decision_function(
                    test_labelledCorpus)
                output_scores_train = classifier.decision_function(
                    train_labelledCorpus)

                # Sort by (score, predicted label, expected label).  The vector
                # is kept out of the key: comparing two numpy arrays on a tie
                # raises "truth value of an array is ambiguous".
                rows = sorted(
                    zip(output_scores_test, output_test, test_labels,
                        test_labelledCorpus),
                    key=lambda row: row[:3])
                output_scores_test, output_test, test_labels, test_labelledCorpus = zip(
                    *rows)

                all_test_labels += test_labels
                all_output_scores_test += output_scores_test
                all_train_labels += tuple(train_labels)
                all_output_scores_train += tuple(output_scores_train)

                # save result to file
                for r in range(len(test_labels)):
                    # index into the full corpus, which is aligned with `reports`
                    reportIdx = corpusList.index(list(test_labelledCorpus[r]))
                    writer.writerow([])
                    writer.writerow([
                        output_scores_test[r], output_test[r], test_labels[r]
                    ])
                    writer.writerow([reports[reportIdx]])

            # generate the roc curve
            fp_test, tp_test, _ = roc_curve(all_test_labels,
                                            all_output_scores_test,
                                            pos_label="positive")
            fp_train, tp_train, _ = roc_curve(all_train_labels,
                                              all_output_scores_train,
                                              pos_label="positive")

            # Calculate the area under the curves
            area_test = auc(fp_test, tp_test)
            area_train = auc(fp_train, tp_train)

            # Plot the average ROC curves
            plt.plot(fp_test,
                     tp_test,
                     'b',
                     label='test(area = %0.2f)' % area_test)
            plt.plot(fp_train,
                     tp_train,
                     'r',
                     label='train(area = %0.2f)' % area_train)
            plt.legend(loc='lower right')
            plt.savefig(directory + name)
from sklearn.utils import shuffle
from preprocess import getData


#### One-hot ("indicator") encoding of the targets
def y2indicator(y, K):
    """Return a len(y) x K matrix of zeros with a 1 at column y[i] of row i."""
    indicator = np.zeros((len(y), K))
    for row in range(len(y)):
        indicator[row, y[row]] = 1
    return indicator


#### Load the data, shuffle it and hold out the last 100 samples for testing
X, Y = getData()
X, Y = shuffle(X, Y)
Y = Y.astype(np.int32)

D = X.shape[1]   # number of columns of X
K = len(set(Y))  # number of distinct target values

X_train, X_test = X[:-100], X[-100:]
Y_train, Y_test = Y[:-100], Y[-100:]
Y_train_ind = y2indicator(Y_train, K)
Y_test_ind = y2indicator(Y_test, K)

#### Random initial weights, one column per class
W = np.random.randn(D, K)
def labelClassificationRNN(learn=True):
    """Evaluate (and optionally tune) a linear SVM on the RNN report vectors.

    For every entry of ``REPORT_FILES_LABELLED`` and every candidate C value
    the function runs ``numFolds`` random splits (15% test; the last 20% of
    the remaining training data is held out as a cross-validation set), pools
    labels and decision scores over the splits and computes test / cv / train
    ROC curves.  The C value with the highest cv AUC is kept and its three
    curves are plotted.

    Args:
        learn: When True, search a fixed grid of C values and pickle the best
            value per diagnosis to ``./model_files/svm_c_values.pkl``.  When
            False, load the C values from that file and use them as the only
            candidate; nothing is written back.

    The CSV of test predictions and one ROC figure per diagnosis are written
    to a ``label_classification/<MM_DD_HH_MM>/`` directory, created if
    missing.
    """
    cValuesPath = './model_files/svm_c_values.pkl'
    if learn:
        # One row per candidate, one column per diagnosis.  (A leading 0.001
        # row used to be assigned and immediately overwritten; the effective
        # grid is unchanged.)
        c_vals = [[c] * 4 for c in (0.005, 0.01, 0.05, 0.1, 0.5, 1)]
        optimal_c = [[0, 0, 0, 0]]
    else:
        # Pickle data is bytes: the file must be opened in binary mode.
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only load model files that this project wrote itself.
        with open(cValuesPath, 'rb') as cValuesFile:
            c_vals = pickle.load(cValuesFile)
        optimal_c = c_vals

    reports = preprocess.getReports()
    reportVectors = rnn.loadReportVecs()
    numFolds = 5  # number of random splits per C value

    directory = "label_classification/" + datetime.datetime.now().strftime(
        '%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)

    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])

        for j in range(len(REPORT_FILES_LABELLED)):
            # two blank rows, then the diagnosis name as a section header
            writer.writerow([])
            writer.writerow([])
            writer.writerow([DIAGNOSES[j]])

            # The labelled data is at the start of each report file's range:
            # collect the raw reports and their vectors.
            firstIdx = preprocess.getNumReports(REPORT_FILES[:j])
            lastIdx = firstIdx + preprocess.getNumReports(
                [REPORT_FILES_LABELLED[j]])
            labelledReports = []
            labelledCorpus = []
            for i in range(firstIdx, lastIdx):
                labelledReports.append(reports[i])
                labelledCorpus.append(reportVectors[i][:])
            labels = np.asarray(
                preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]
            # positions in corpusList line up with labelledReports
            corpusList = [list(x) for x in labelledCorpus]

            best_area_cv = -1
            for c_value in c_vals:
                # labels and scores pooled over all splits for this C value
                all_test_labels = ()
                all_output_scores_test = ()
                all_cv_labels = ()
                all_output_scores_cv = ()
                all_train_labels = ()
                all_output_scores_train = ()

                for n in range(numFolds):
                    # split training and test data
                    train_labelledCorpus, test_labelledCorpus, train_labels, test_labels = train_test_split(
                        labelledCorpus, labels, test_size=0.15)

                    # Split off the last 20% of the training set for cross validation
                    splitAt = int(0.8 * len(train_labelledCorpus))
                    cv_labelledCorpus = train_labelledCorpus[splitAt:]
                    train_labelledCorpus = train_labelledCorpus[:splitAt]
                    labelSplitAt = int(0.8 * len(train_labels))
                    cv_labels = train_labels[labelSplitAt:]
                    train_labels = train_labels[:labelSplitAt]

                    # build classifier
                    classifier = svm.SVC(C=c_value[j], kernel='linear').fit(
                        train_labelledCorpus, train_labels)

                    # compute output label and corresponding score
                    output_test = classifier.predict(test_labelledCorpus)
                    output_scores_test = classifier.decision_function(
                        test_labelledCorpus)
                    output_scores_train = classifier.decision_function(
                        train_labelledCorpus)
                    output_scores_cv = classifier.decision_function(
                        cv_labelledCorpus)

                    all_test_labels += tuple(test_labels)
                    all_output_scores_test += tuple(output_scores_test)
                    all_cv_labels += tuple(cv_labels)
                    all_output_scores_cv += tuple(output_scores_cv)
                    all_train_labels += tuple(train_labels)
                    all_output_scores_train += tuple(output_scores_train)

                    # save result for fold to file
                    for r in range(len(test_labels)):
                        reportIdx = corpusList.index(
                            list(test_labelledCorpus[r]))
                        # Wrapped in a list: writerow() on a bare string
                        # writes one cell per character.
                        writer.writerow(
                            ["With c value: " + str(c_value[j])])
                        writer.writerow([
                            output_scores_test[r], output_test[r],
                            test_labels[r]
                        ])
                        writer.writerow([labelledReports[reportIdx]])

                # generate the roc curve
                fp_test, tp_test, _ = roc_curve(all_test_labels,
                                                all_output_scores_test,
                                                pos_label="positive")
                fp_cv, tp_cv, _ = roc_curve(all_cv_labels,
                                            all_output_scores_cv,
                                            pos_label="positive")
                fp_train, tp_train, _ = roc_curve(all_train_labels,
                                                  all_output_scores_train,
                                                  pos_label="positive")

                # Calculate the area under the curves
                area_test = auc(fp_test, tp_test)
                area_cv = auc(fp_cv, tp_cv)
                area_train = auc(fp_train, tp_train)

                # Store c value, tps, fps and aucs if cv auc is new best
                if area_cv > best_area_cv:
                    optimal_c[0][j] = c_value[j]
                    best_fp_test, best_tp_test = fp_test, tp_test
                    best_fp_cv, best_tp_cv = fp_cv, tp_cv
                    best_fp_train, best_tp_train = fp_train, tp_train
                    best_area_test = area_test
                    best_area_cv = area_cv
                    best_area_train = area_train

            # initialise and plot the average ROC curves for optimal c value
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC: c value of " +
                      str(optimal_c[0][j]))
            plt.plot(best_fp_test,
                     best_tp_test,
                     'b',
                     label='test(area = %0.2f)' % best_area_test)
            plt.plot(best_fp_cv,
                     best_tp_cv,
                     'g',
                     label='cv(area = %0.2f)' % best_area_cv)
            plt.plot(best_fp_train,
                     best_tp_train,
                     'r',
                     label='train(area = %0.2f)' % best_area_train)
            plt.legend(loc='lower right')
            plt.savefig(directory + name)

    if learn:
        # Binary mode is required for pickle on Python 3.
        with open(cValuesPath, 'wb') as cValuesFile:
            pickle.dump(optimal_c, cValuesFile)
import sys
import random
import preprocess
import search

# Command-line entry point: read a request file and print the 50 reports most
# similar to its search term, each with a randomly generated report date.
#
# Expected layout of the input file:
#   line 1: search term
#   line 2: "notonlyreturndates" (print report text too) or "onlyreturndates"
#   line 3: a date whose first four characters are the year; generated dates
#           fall in years before it

# process the search term
if (len(sys.argv) < 2):
    print("ERROR: Please specify an input file")
    sys.exit()
fileName = str(sys.argv[1])
# Context manager so the input file is closed once it has been read.
with open(fileName) as inputFile:
    fileText = [row.rstrip('\n') for row in inputFile]

# A file with fewer than three lines, or without a numeric year on line 3,
# used to end in an IndexError / ValueError traceback; report it as a layout
# error instead.
latestYear = None
if (len(fileText) >= 3):
    try:
        latestYear = int(fileText[2][0:4]) - 1
    except ValueError:
        latestYear = None
if (latestYear is None):
    print("ERROR: input file layout error")
    sys.exit()
# random.randint() raises when its lower bound exceeds the upper bound, which
# happened for any year up to 2000.
earliestYear = min(2000, latestYear)

if (fileText[1] == "notonlyreturndates"):
    includeReport = True
    print("Req No.<,>Report Date<,>Report")
elif (fileText[1] == "onlyreturndates"):
    includeReport = False
    print("Req No.<,>Report Date")
else:
    print("ERROR: input file layout error")
    sys.exit()

data = preprocess.getData()
similarReports = search.search("lsi",50,fileText[0])
for reportIdx in similarReports:
    year = random.randint(earliestYear, latestYear)
    month = random.randint(1,12)
    date = random.randint(1,28)  # 28 keeps the day valid for every month
    record = data[reportIdx[0]]
    fields = [record[0], str(year) + str(month).zfill(2) + str(date).zfill(2)]
    if includeReport:
        fields.append(record[1])
    print("<,>".join(fields))