from main_codes import get_data_splited from main_codes import write_on_file_L2RFormat as wr lables, qid, data = get_data_splited.get_data(fname="train.txt") n_features_out = len(data[0]) print("n_features_out = ", n_features_out) from pylmnn.lmnn import LargeMarginNearestNeighbor # lmnn = LargeMarginNearestNeighbor(n_neighbors=1) lmnn = LargeMarginNearestNeighbor(L=None, load=None, max_constr=10000000, max_iter=200, n_features_out=5, n_neighbors=7, random_state=1, save=None, tol=1e-05, use_pca=True, use_sparse=True, verbose=1) lmnn.fit(data, lables) # doctest: +ELLIPSIS data = lmnn.transform(data) wr.write_("train_transformed.txt", lables=lables, qids=qid, datas=data) test_lables, test_qid, test_data = get_data_splited.get_data(fname="test.txt") test_data = lmnn.transform(test_data) wr.write_("test_transformed.txt",
def main(): hdul = fits.open("deep.fits") hdulData = hdul[1].data print(hdulData) parser = argparse.ArgumentParser(description="This script runs kNN with the required parameters. You should look at those.") parser.add_argument("-c", "--catType", nargs=1, required=True, type=int, help="0 for DEEP, 1 for WIDE - Catalogue Type, 2 for complete dataset, including missing values") parser.add_argument("-f", "--fillMethod", nargs=1, required=False, type=int, help="0 for replacement of missing values with the column mean. 1 for column median.") parser.add_argument("-t", "--testType", nargs=1, required=True, type=int, help="0 for normal, 1 for sub-field test with train = ELAIS-S1, 2 = sub-field test with train = eCDFS") parser.add_argument("-C", "--colsType", nargs=1, required=True, type=int, help="#0 for radio, sIndex, 3.6, 4.5, 5.8, 8.0, g, r, i, z. 1 for radio, 3.6, 4.5, 5.8, 8.0, g, r, i, z. 2 for radio, 3.6, 4.5, g, r, i, z") parser.add_argument("-d", "--distType", nargs=1, required=True, type=int, help="1 for Manhattan, 2 for Euclidean, 99 for Mahalanobis") parser.add_argument("-b", "--bootstrapSize", nargs=1, required=False, type=int, help="Number of bootstrap intervals. Do not use if you don't want bootstrap") parser.add_argument("-p", "--preBin", nargs=1, required=False, type=bool, help="Should the data be pre-binned? Don't enter for no") parser.add_argument("-P", "--postBin", nargs=1, required=False, type=bool, help="Should the data be post-binned? Don't enter for no") parser.add_argument("-z", "--classification", nargs=1, required=False, type=bool, help="Classification or regression. True for classification Don't enter for no") parser.add_argument("-m", "--metricLearn", nargs=1, required=False, type=bool, help="Should metric learning be used? Don't enter for no") parser.add_argument("-M", "--method", nargs=1, required=False, type=int, help="The type of ML to use. Nothing for kNN, 1 Linear Regression, 2 for Random Forest, 3 for lasso regression, 4 for ridge regression") args = vars(parser.parse_args()) #Data, Tests and Columns to use catType = args["catType"][0] testType = args["testType"][0] colsType = args["colsType"][0] distType = args["distType"][0] FailureLimit = 0.15 nSplits = 10 #Used in k-Fold Cross Validation. if args["fillMethod"] == None: fillMethod = 0 else: fillMethod = args["fillMethod"][0] if args["bootstrapSize"] == None: bootstrapSize = False else: bootstrapSize = args["bootstrapSize"][0] binData = 15 if args["postBin"] == None: postBin = False else: postBin = args["postBin"][0] if args["preBin"] == None: preBin = False else: preBin = args["preBin"][0] if args["classification"] == None: classification = False else: classification = args["classification"][0] if args["metricLearn"] == None: metricLearn = False else: metricLearn = args["metricLearn"][0] if args["method"] == None: MLMethod = 0 else: MLMethod = args["method"][0] if MLMethod == 2: neighboursList = range(2,60) #Using Random Forest. Should be different! elif MLMethod == 0: neighboursList = range(2,20) else: neighboursList = [0] folderpath = "cat-" + str(catType).strip() + "_fillMethod-" + str(fillMethod).strip() + "_testType-" + str(testType).strip() + "_colsType-" + str(colsType).strip() + "_distType-" + str(distType).strip() folderpath = folderpath + "_boot-" + str(bootstrapSize).strip() + "_preBin-" + str(preBin).strip() + "_postBin-" + str(postBin).strip() folderpath = folderpath + "_class-" + str(classification).strip() + "_metricLearn-" + str(metricLearn).strip() + "_MLMethod-" + str(MLMethod) if not os.path.exists("Results"): os.makedirs("Results") os.chdir("Results") startTime = datetime.now() # if not os.path.exists(now.strftime("%d-%m-%Y")): # os.makedirs(now.strftime("%d-%m-%Y")) # os.chdir(now.strftime("%d-%m-%Y")) if not os.path.exists(folderpath): os.makedirs(folderpath) #Individual switches to modify "small" details logZ = False #Set to True if z should be log(z) #NOTE!!! There is no check on the below redshift modifications to make sure selections make sense! maxRedshift = None #Flag to set max redshift to keep minRedshift = None #Flag to set min redshift to keep np.random.seed(42) useLogRadio = False #Set to true to take the log of radio data. False otherwise useColoursOptical = False #Set to True to use Optical Colours instead of Magnitudes useIRMagnitudes = False #Set to True to use log(IRFlux) useColoursIR = False #Set to True to use IR Colours instead of Fluxes (Implies True to above) standardiseXVals = True #Set to True to standardise the X-Values (x_i - x_mean) / x_sd if catType == 0: # catalogue = "../ATLAS_CATALOGUE/ATLAS-SWIRE-GRC-merged-2017-11-07.fits" catalogue = "deep.fits" # catalogue = "/bigdata/users/postgrad/kluken/Masters/ATLAS_CATALOGUE/ATLAS_Reduced_NoSindex.fits" elif catType == 1: catalogue = "wide.fits" #catalogue = "/bigdata/users/postgrad/kluken/Masters/ATLAS_CATALOGUE/ATLAS_EMU_3.6_4.5_DES.fits" elif catType == 2: catalogue = "missing.fits" if colsType == 0: dataCols = ["z","Sp2","Sindex","flux_ap2_36","flux_ap2_45","flux_ap2_58","flux_ap2_80","MAG_APER_4_G","MAG_APER_4_R","MAG_APER_4_I","MAG_APER_4_Z"] dataType = [0,1,2,3,3,3,3,4,4,4,4] elif colsType == 1: dataCols = ["z","Sp2","flux_ap2_36","flux_ap2_45","flux_ap2_58","flux_ap2_80","MAG_APER_4_G","MAG_APER_4_R","MAG_APER_4_I","MAG_APER_4_Z"] dataType = [0,1,3,3,3,3,4,4,4,4] elif colsType == 2: dataCols = ["z","Sp2","flux_ap2_36","flux_ap2_45","MAG_APER_4_G","MAG_APER_4_R","MAG_APER_4_I","MAG_APER_4_Z"] dataType = [0,1,3,3,4,4,4,4] #Create instance of preprocessing class to clean data preprocess = functions.DataProcessing() #Open Fits Catalogue print(catalogue) print(folderpath) # hdul = fits.open("missing.fits") # hdulData = hdul[1].data # print("Time taken to open fitsFile: " + str(datetime.now() - startTime)) os.chdir(folderpath) if os.path.isfile("resultsPlot.pdf"): print(folderpath + " already complete") sys.exit() #Create catalogueData array from the redshift column catalogueData = np.reshape(np.array(hdulData.field(dataCols[0]), dtype=np.float32), [len(hdulData.field(dataCols[0])),1]) #Add the columns required for the test for i in range(1, len(dataCols)): catalogueData = np.hstack([catalogueData,np.reshape(np.array(hdulData.field(dataCols[i]), dtype=np.float32), [len(hdulData.field(dataCols[i])),1])]) fieldList = np.reshape(np.array(hdulData.field("field"), dtype=np.str), [len(hdulData.field("field")),1]) # print("Time taken to create catalogueData: " + str(datetime.now() - startTime)) catalogueData1 = pd.DataFrame(catalogueData) catalogueData1.to_csv("catalogueData1.csv", index=False) #Begin cleaning process #Remove items with missing redshifts missingRedshifts = np.where(catalogueData[:,0] <= 0)[0] cleanCatalogue = np.delete(catalogueData, missingRedshifts, 0) fieldList = np.delete(fieldList, missingRedshifts, 0) #Make sure values are all within "sane" ranges for i in range(1, len(dataCols)): cleanCatalogue[:,i] = preprocess.cleanData(cleanCatalogue[:,i], dataType[i], fillMethod) cleanCatalogue1 = pd.DataFrame(cleanCatalogue) cleanCatalogue1.to_csv("cleanCatalogue1.csv", index=False) #print("updated Categologe", cleanCatalogue.shape) #Removing min and max redshifts if set if minRedshift != None: killRedshift = np.where(cleanCatalogue[:,0] < minRedshift)[0] cleanCatalogue = np.delete(cleanCatalogue, killRedshift, 0) if maxRedshift != None: killRedshift = np.where(cleanCatalogue[:,0] > maxRedshift)[0] cleanCatalogue = np.delete(cleanCatalogue, killRedshift, 0) if postBin and not classification: temp = cleanCatalogue[:,0] temp, binEdges, binnedZ = binDataFunc(temp, binData) #Bin z values if preBin or classification: cleanCatalogue[:,0], binEdges, binnedZ = binDataFunc(cleanCatalogue[:,0], binData) #Take log(z) if logZ: cleanCatalogue[:,0] = np.log(cleanCatalogue[:,0]) #Use log(Radio) if useLogRadio: cleanCatalogue[:,1] = np.log(cleanCatalogue[:,1]) #Use Optical Colours if useColoursOptical: for i in range(-4, -2): cleanCatalogue[:,i] = cleanCatalogue[:,i] - cleanCatalogue[:,i+1] cleanCatalogue = cleanCatalogue[:,0:-1] #Use IR Colours. Each dataType has different column numbers for IR, hence need different solution to each. #Need to take log(IR Flux) to get them into "Magnitudes", which can then be used to calculate the difference #between each - "Colours". Given we lose a column of data going to colours, delete the last column. #Set useMagnitudes to False so we don't then take a log of a ratio of a log. if useColoursIR: if colsType == 0: for i in range(4,6): cleanCatalogue[:,i] = np.log(cleanCatalogue[:,i]) for i in range(4,5): cleanCatalogue[:,i] = cleanCatalogue[:,i] - cleanCatalogue[:,i+1] cleanCatalogue = np.delete(cleanCatalogue, obj = 6, axis = 1) elif colsType == 1: for i in range(3,5): cleanCatalogue[:,i] = np.log(cleanCatalogue[:,i]) for i in range(3,4): cleanCatalogue[:,i] = cleanCatalogue[:,i] - cleanCatalogue[:,i+1] cleanCatalogue = np.delete(cleanCatalogue, obj = 5, axis = 1) elif colsType == 2: for i in range(3,4): cleanCatalogue[:,i] = np.log(cleanCatalogue[:,i]) cleanCatalogue[:,2] = cleanCatalogue[:,2] - cleanCatalogue[:,3] cleanCatalogue = np.delete(cleanCatalogue, obj = 3, axis = 1) useIRMagnitudes = False #Take the log(IR Flux) to get the IR "Magnitudes" if useIRMagnitudes: if colsType == 0: for i in range(4,6): cleanCatalogue[:,i] = np.log(cleanCatalogue[:,i]) elif colsType == 1: for i in range(3,5): cleanCatalogue[:,i] = np.log(cleanCatalogue[:,i]) elif colsType == 2: for i in range(3,4): cleanCatalogue[:,i] = np.log(cleanCatalogue[:,i]) cleanCatalogue1 = pd.DataFrame(cleanCatalogue) cleanCatalogue1.to_csv("cleanCatalogue1.csv", index=False) print("cleaning done") #Standardising all xVals - (x_i - x_mean) / x_sd if standardiseXVals: for i in range(1, cleanCatalogue.shape[1]): cleanCatalogue[:,i] = (cleanCatalogue[:,i] - np.mean(cleanCatalogue[:,i])) / np.std(cleanCatalogue[:,i]) # print("Time taken to clean and pre-process cleanCatalogue: " + str(datetime.now() - startTime)) y_vals = cleanCatalogue[:,[0]] x_vals = cleanCatalogue[:,1:] num_features = x_vals.shape[1] predictionBootstrap = [] mseBootstrap = [] outlierBootstrap = [] # Split the data into train and test sets if testType == 0: #Withdraw our 30% test set np.random.seed(225) test_indices = np.random.choice(len(x_vals), round(len(x_vals)*0.3), replace=False) train_indices = np.array(list(set(range(len(x_vals))) - set(test_indices))) x_vals_train = x_vals[train_indices] x_vals_test = x_vals[test_indices] y_vals_train = y_vals[train_indices] y_vals_test = y_vals[test_indices] elif testType == 1: #Withdraw our test set x_vals_test = x_vals[np.where(fieldList == "CDFS ")[0]] y_vals_test = y_vals[np.where(fieldList == "CDFS ")[0]] #Find the training set x_vals_train = x_vals[np.where(fieldList == "ELAIS-S1")[0]] y_vals_train = y_vals[np.where(fieldList == "ELAIS-S1")[0]] elif testType == 2: #Withdraw our test set y_vals_test = y_vals[np.where(fieldList == "ELAIS-S1")[0]] x_vals_test = x_vals[np.where(fieldList == "ELAIS-S1")[0]] #Find the training set x_vals_train = x_vals[np.where(fieldList == "CDFS ")[0]] y_vals_train = y_vals[np.where(fieldList == "CDFS ")[0]] print("start") x_vals_train_df = pd.DataFrame(x_vals_train) y_vals_train_df = pd.DataFrame(y_vals_train) vals_train_df = pd.concat([y_vals_train_df, x_vals_train_df], ignore_index=True, axis=1) vals_train_df.to_csv('vals_train_df_test_type{dbname}.csv'.format(dbname=testType), index=False) print("train size", vals_train_df.shape) x_vals_test_df = pd.DataFrame(x_vals_test) y_vals_test_df = pd.DataFrame(y_vals_test) vals_test_df = pd.concat([y_vals_test_df, x_vals_test_df], ignore_index=True, axis=1) vals_test_df.to_csv('vals_test_df_test_type{dbname}.csv'.format(dbname=testType), index=False) print("test size", vals_test_df.shape) method_no = 6 if method_no == 1: imputed_file_name = "KNN_imputated_catalogueData1.csv" elif method_no == 2: imputed_file_name = "GAN_imputated_catalogueData1.csv" elif method_no == 3: imputed_file_name = "Mean_imputated_catalogueData1.csv" elif method_no == 4: imputed_file_name = "Median_imputated_catalogueData1.csv" elif method_no == 5: imputed_file_name = "MICE_imputated_catalogueData1.csv" else: imputed_file_name = "None" if imputed_file_name != "None": if not os.path.isfile(imputed_file_name): sourceFolderPath = "Source File Path/Results/" destFolderPath = sourceFolderPath + folderpath + "/" shutil.move(os.path.join(sourceFolderPath, imputed_file_name), destFolderPath) x_vals_test = np.loadtxt(imputed_file_name, delimiter=",", usecols=(range(1, 10)), skiprows=1) y_vals_test = np.loadtxt(imputed_file_name, delimiter=",", usecols=(0), skiprows=1) y_vals_test = y_vals_test.reshape(-1) print("x size is:", x_vals_test.shape) print("y size is: ", y_vals_test.shape) print("done") if not preBin and metricLearn: metricLearnModel = metricLearnRegression(x_vals_train, y_vals_train) x_vals_train = metricLearnModel.transform(x_vals_train) x_vals_test = metricLearnModel.transform(x_vals_test) if type(bootstrapSize) == int: predictionBootstrap = [] mseBootstrap = [] outlierBootstrap = [] # for i in tqdm(range(bootstrapSize)): for i in range(bootstrapSize): # Use metric learning if required if metricLearn and not classification: B = metricLearnRegression(x_vals_train, y_vals_train) x_vals_train = B.transform(x_vals_train) x_vals_test = B.transform(x_vals_test) # Split the data into train and test sets # Randomly sample our training set for bootstrapping train_indices = np.random.choice(len(y_vals_train), len(y_vals_train), replace=True) x_vals_train_bootstrap = x_vals_train[train_indices] y_vals_train_bootstrap = y_vals_train[train_indices] kFold = KFold(n_splits=nSplits, random_state=10, shuffle=True) MSE = [] Failed = [] # for numNeighbours in tqdm(neighboursList): for numNeighbours in neighboursList: mseList = [] failed = [] # for trainIndex, testIndex in tqdm(kFold.split(x_vals_train_bootstrap), total=nSplits): for trainIndex, testIndex in kFold.split(x_vals_train_bootstrap): x_vals_train_cross = x_vals_train_bootstrap[trainIndex] x_vals_test_cross = x_vals_train_bootstrap[testIndex] y_vals_train_cross = y_vals_train_bootstrap[trainIndex] y_vals_test_cross = y_vals_train_bootstrap[testIndex] if MLMethod == 0: pred, mseTest = kNN(numNeighbours, x_vals_train_cross, x_vals_test_cross, y_vals_train_cross, y_vals_test_cross, distType) elif MLMethod == 1: pred, mseTest = linRegress(x_vals_train_cross, x_vals_test_cross, y_vals_train_cross, y_vals_test_cross) elif MLMethod == 2: pred, mseTest = randomForestRegress(numNeighbours, x_vals_train_cross, x_vals_test_cross, y_vals_train_cross, y_vals_test_cross) elif MLMethod == 3: pred, mseTest = lassoRegress(x_vals_train_cross, x_vals_test_cross, y_vals_train_cross, y_vals_test_cross) elif MLMethod == 4: pred, mseTest = ridgeRegress(x_vals_train_cross, x_vals_test_cross, y_vals_train_cross, y_vals_test_cross) lengthOfSplit = len(pred) if logZ: error = np.abs(np.exp(pred) - np.exp(y_vals_test_cross)) failed.append(len(error[np.where(error > (FailureLimit * (1+np.exp(y_vals_test_cross))))[0]])/lengthOfSplit ) else: error = np.abs(pred - y_vals_test_cross) failed.append(len(error[np.where(error > (FailureLimit * (1+y_vals_test_cross)))[0]])/lengthOfSplit ) mseList.append(np.round(mseTest,3)) MSE.append(np.mean(mseList)) Failed.append(np.mean(failed)) mseBootstrap.append(MSE) outlierBootstrap.append(Failed) bestKIndex = (np.argmin(np.array(Failed))) bestK = neighboursList[bestKIndex] if MLMethod == 0: pred, mse_test = kNN(numNeighbours, x_vals_train_bootstrap, x_vals_test, y_vals_train_bootstrap, y_vals_test, distType) elif MLMethod == 1: pred, mse_test = linRegress(x_vals_train_bootstrap, x_vals_test, y_vals_train_bootstrap, y_vals_test) elif MLMethod == 2: pred, mse_test = randomForestRegress(numNeighbours, x_vals_train_bootstrap, x_vals_test, y_vals_train_bootstrap, y_vals_test) elif MLMethod == 3: pred, mse_test = lassoRegress(x_vals_train_bootstrap, x_vals_test, y_vals_train_bootstrap, y_vals_test) elif MLMethod == 4: pred, mse_test = ridgeRegress(x_vals_train_bootstrap, x_vals_test, y_vals_train_bootstrap, y_vals_test) if logZ: error = np.abs(np.exp(pred) - np.exp(y_vals_test)) testError = (len(error[np.where(error > (FailureLimit*(1+np.exp(y_vals_test))))[0]])/len(pred) ) else: error = np.abs(pred - y_vals_test) testError = (len(error[np.where(error > (FailureLimit*(1+y_vals_test)))[0]])/len(pred) ) if logZ: predictionBootstrap.append(np.exp(pred)) else: predictionBootstrap.append(pred) outlier_final = [] mse_final = [] kFold = KFold(n_splits=nSplits, random_state=10, shuffle=True) # for numNeighbours in tqdm(neighboursList): for numNeighbours in neighboursList: mseList = [] failed = [] # TODO: Need to turn this back on when regression metric learn is working. if metricLearn and preBin: # and classification: lmnn = LMNN(n_neighbors=numNeighbours, max_iter=200, n_features_out=x_vals_train.shape[1], verbose=None) # for trainIndex, testIndex in tqdm(kFold.split(x_vals_train), total=nSplits): for trainIndex, testIndex in kFold.split(x_vals_train): #Define training and test sets x_vals_train_cross = x_vals_train[trainIndex] x_vals_test_cross = x_vals_train[testIndex] y_vals_train_cross = y_vals_train[trainIndex] y_vals_test_cross = y_vals_train[testIndex] # TODO: Need to turn this back on when regression metric learn is working. if metricLearn and preBin: # and classification: lmnn.fit(x_vals_train_cross, np.squeeze(y_vals_train_cross.astype(str))) x_vals_train_cross = lmnn.transform(x_vals_train_cross) x_vals_test_cross = lmnn.transform(x_vals_test_cross) # Use metric learning if required if metricLearn and not classification: B = metricLearnRegression(x_vals_train_cross, y_vals_train_cross) x_vals_train_cross = B.transform(x_vals_train_cross) x_vals_test_cross = B.transform(x_vals_test_cross) if classification: if MLMethod == 0: pred, mseTest = kNN_classification(numNeighbours, x_vals_train_cross, x_vals_test_cross, y_vals_train_cross, y_vals_test_cross, distType) elif MLMethod == 1: pred, mseTest = logRegress(x_vals_train_cross, x_vals_test_cross, y_vals_train_cross, y_vals_test_cross) elif MLMethod == 2: pred, mseTest = randomForestClass(numNeighbours, x_vals_train_cross, x_vals_test_cross, y_vals_train_cross, y_vals_test_cross) else: if MLMethod == 0: pred, mseTest = kNN(numNeighbours, x_vals_train_cross, x_vals_test_cross, y_vals_train_cross, y_vals_test_cross, distType) elif MLMethod == 1: pred, mseTest = linRegress(x_vals_train_cross, x_vals_test_cross, y_vals_train_cross, y_vals_test_cross) elif MLMethod == 2: pred, mseTest = randomForestRegress(numNeighbours, x_vals_train_cross, x_vals_test_cross, y_vals_train_cross, y_vals_test_cross) elif MLMethod == 3: pred, mseTest = lassoRegress(x_vals_train_cross, x_vals_test_cross, y_vals_train_cross, y_vals_test_cross) elif MLMethod == 4: pred, mseTest = ridgeRegress(x_vals_train_cross, x_vals_test_cross, y_vals_train_cross, y_vals_test_cross) lengthOfSplit = len(pred) if logZ: error = np.abs(np.exp(pred) - np.exp(np.squeeze(y_vals_test_cross))) failed.append(len(error[np.where(error > (FailureLimit * (1+np.exp(np.squeeze(y_vals_test_cross)))))[0]])/lengthOfSplit ) else: error = np.abs(pred - np.squeeze(y_vals_test_cross)) failed.append(len(error[np.where(error > (FailureLimit * (1+np.squeeze(y_vals_test_cross))))[0]])/lengthOfSplit ) mseList.append(np.round(mseTest,3)) mse_final.append(np.mean(mseList)) outlier_final.append(np.mean(failed)) bestKIndex = (np.argmin(np.array(outlier_final))) bestK = neighboursList[bestKIndex] if classification: if metricLearn: lmnn = LMNN(n_neighbors=bestK, max_iter=200, n_features_out=x_vals_train.shape[1], verbose=None) lmnn.fit(x_vals_train, np.squeeze(y_vals_train.astype(str))) x_vals_train = lmnn.transform(x_vals_train) x_vals_test = lmnn.transform(x_vals_test) if MLMethod == 0: finalPrediction, finalMSE = kNN_classification(bestK, x_vals_train, x_vals_test, y_vals_train, y_vals_test, distType) if MLMethod == 1: finalPrediction, finalMSE = logRegress(x_vals_train, x_vals_test, y_vals_train, y_vals_test) if MLMethod == 2: finalPrediction, finalMSE = randomForestClass(bestK, x_vals_train, x_vals_test, y_vals_train, y_vals_test) else: # TODO: Need to remove/change this once regression metric learning is done. if metricLearn and preBin: lmnn = LMNN(n_neighbors=bestK, max_iter=200, n_features_out=x_vals_train.shape[1], verbose=None) lmnn.fit(x_vals_train, np.squeeze(y_vals_train.astype(str))) x_vals_train = lmnn.transform(x_vals_train) x_vals_test = lmnn.transform(x_vals_test) if MLMethod == 0: finalPrediction, finalMSE = kNN(bestK, x_vals_train, x_vals_test, y_vals_train, y_vals_test, distType) elif MLMethod == 1: finalPrediction, finalMSE = linRegress(x_vals_train, x_vals_test, y_vals_train, y_vals_test) elif MLMethod == 2: finalPrediction, finalMSE = randomForestRegress(bestK, x_vals_train, x_vals_test, y_vals_train, y_vals_test) elif MLMethod == 3: finalPrediction, finalMSE = lassoRegress(x_vals_train, x_vals_test, y_vals_train, y_vals_test) elif MLMethod == 4: finalPrediction, finalMSE = ridgeRegress(x_vals_train, x_vals_test, y_vals_train, y_vals_test) residuals = (np.squeeze(y_vals_test) - finalPrediction) / (1 + np.squeeze(y_vals_test)) if postBin: finalPrediction, temp, temp2 = binDataFunc(finalPrediction, binData, binEdges = binEdges, newZ = binnedZ) y_vals_test, temp, temp2 = binDataFunc(y_vals_test, binData, binEdges = binEdges, newZ = binnedZ) if postBin or classification: confusion = confusion_matrix(np.round(y_vals_test,2).astype(str),np.round(finalPrediction,2).astype(str))#.astype(float) #plotNormConfusionMatrix(confusion,binnedZ,binEdges) plotScaledConfusionMatrix(y_vals_test, finalPrediction, binEdges, binnedZ) mutualInfo = adjusted_mutual_info_score(np.squeeze(y_vals_test).astype(str),np.squeeze(finalPrediction).astype(str)) if logZ: error = np.abs(np.exp(finalPrediction) - np.exp(np.squeeze(y_vals_test))) testError = (len(error[np.where(error > (FailureLimit*(1+np.exp(np.squeeze(y_vals_test)))))[0]])/len(finalPrediction) ) else: error = np.abs(finalPrediction - np.squeeze(y_vals_test)) testError = (len(error[np.where(error > (FailureLimit*(1+np.squeeze(y_vals_test))))[0]])/len(finalPrediction) ) if type(bootstrapSize) == int: percentiles = np.percentile(predictionBootstrap, q=[2.5,97.5], axis=0) percentiles[0,:] = np.abs(finalPrediction - percentiles[0,:]) percentiles[1,:] = np.abs(percentiles[1,:] - finalPrediction) if classification or postBin: precision = metrics.precision_score(y_vals_test.ravel().astype(str), finalPrediction.ravel().astype(str), average="macro") recall = metrics.recall_score(y_vals_test.ravel().astype(str), finalPrediction.ravel().astype(str), average="macro") f1 = metrics.f1_score(y_vals_test.ravel().astype(str), finalPrediction.ravel().astype(str), average="macro") else: mse = metrics.mean_squared_error(y_vals_test.ravel(), finalPrediction.ravel()) predFile = "finalPredictions" yValsFile = "yValsFile" mseFile = "mseFile" outlierFile = "outlierFile" binEdgesFile = "binEdges" with open(predFile, "wb") as openFile: pickle.dump(finalPrediction, openFile) with open(yValsFile, "wb") as openFile: pickle.dump(y_vals_test, openFile) with open(mseFile, "wb") as openFile: pickle.dump(mse_final, openFile) with open(outlierFile, "wb") as openFile: pickle.dump(outlier_final, openFile) if postBin or classification: with open(binEdgesFile, "wb") as openFile: pickle.dump(binEdges, openFile) outlierRate = 100*len(residuals[np.where(abs(residuals)>0.15)])/len(residuals) with open("results.csv", "w") as openFile: if postBin or classification: openFile.write("bestK,numTrainSources,numTestSources,outlier,score,mutualInfo,residual_std_dev,precision,recall,f1,time\n") openFile.write(str(bestK) + "," + str(y_vals_train.shape[0]) + "," + str(y_vals_test.shape[0]) + "," + str(outlierRate) + "," + str(finalMSE) + "," + str(mutualInfo) + "," + str(np.std(residuals)) + "," + str(precision) + "," + str(recall) + "," + str(f1) + "," + str(datetime.now() - startTime)) else: openFile.write("bestK,numTrainSources,numTestSources,outlier,score,residual_std_dev,mse,time\n") openFile.write(str(bestK) + "," + str(y_vals_train.shape[0]) + "," + str(y_vals_test.shape[0]) + "," + str(outlierRate) + "," + str(finalMSE) + "," + str(np.std(residuals)) + "," + str(mse) + "," + str(datetime.now() - startTime)) #Find number of test sources to use in the plot titles if MLMethod != 3 and MLMethod != 4 and MLMethod != 1: plt.figure(0) if classification: plt.plot(neighboursList, np.array(mse_final), color="springgreen", label="Accuracy") else: plt.plot(neighboursList, np.array(mse_final), color="springgreen", label=r'R$^2$') plt.plot(neighboursList, np.array(outlier_final), color="deepskyblue", label="Failure Rate") plt.ylabel("Error Metric") if MLMethod == 0: plt.xlabel('Number of Neighbours') elif MLMethod == 2: plt.xlabel("Number of Trees") plt.axvline(bestK,color="red", alpha=0.5) plt.legend() plt.tight_layout() plt.grid() plt.savefig("cross_validation.pdf") if logZ: y_vals_test = np.exp(y_vals_test) if type(bootstrapSize) == bool: plotData(np.squeeze(y_vals_test), finalPrediction, plt, stats, pylab) else: plotData(np.squeeze(y_vals_test), finalPrediction, plt, stats, pylab, percentiles) plt.savefig("resultsPlot.pdf")
test_size=0.33, random_state=19) covX = np.cov(X_train, rowvar=False) h = .02 # step size in the mesh # Create color maps cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF']) cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF']) print('done') for weights in ['uniform']: # we create an instance of Neighbours Classifier and fit the data. clf = LMNN(n_neighbors=n_neighbors, max_iter=150, n_features_out=X.shape[1]) clf.fit(X_train, y_train) # Plot the decision boundary. For that, we will assign a color to each # point in the mesh [x_min, x_max]x[y_min, y_max]. x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1 y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1 xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) # Put the result into a color plot Z = Z.reshape(xx.shape) plt.figure() plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
X = [[0, 3], [1, 2], [2, 4], [3, 1.5]] y = [0, 0, 1, 1] from pylmnn.lmnn import LargeMarginNearestNeighbor # lmnn = LargeMarginNearestNeighbor(n_neighbors=1) lmnn = LargeMarginNearestNeighbor(L=None, load=None, max_constr=10000000, max_iter=200, n_features_out=1, n_neighbors=1, random_state=None, save=None, tol=1e-05, use_pca=True, use_sparse=True, verbose=1) lmnn.fit(X, y) # doctest: +ELLIPSIS print(lmnn.transform(X)) test = [[1.6, 1.6]] print(lmnn.predict(test)) print(lmnn.transform(test))
def find_topic(speechList): speechList = speechList[-28:] #from bush senior documents = [] for speech in speechList: tmp = [] for sent in speech[ 'text_lem']: #sent is a single sentence which is a list of words ss = [] for w in sent: #w is a single word w = re.compile('[%s]' % re.escape(string.punctuation)).sub( '', w) #replace punctuations in a word by '' if len(w) > 2: try: float(w) #do not consider numbers except: ss += [w] tmp += [ ' '.join(ss) ] #tmp is a list of sentences. sentence is a string (space separated words). tmp is a collection of sentences from a speech documents += [tmp] documents_sents = sum(documents, []) #list of sentences of all speeches print(len(documents_sents)) tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=2000, stop_words='english') tf = tf_vectorizer.fit_transform(documents_sents) tf_feature_names = tf_vectorizer.get_feature_names() num_topics = 10 lda = LatentDirichletAllocation(n_topics=num_topics, max_iter=10, learning_method='online', learning_offset=50., random_state=0).fit(tf) possible_topics = [ 'education', 'jobs', 'world affairs', 'health care', 'middle east', 'terrorism', 'taxation', 'social programs', 'law and order', 'iraq war' ] #from later analysis display_topics(lda, tf_feature_names, 15, possible_topics) #now that we have the topic model, let us see, how each speech fares: topwords = [] for topic_idx, topic in enumerate(lda.components_): #topic is a np array of 2000 (num features) numbers tmp = topic.argsort()[:-15 - 1:-1] topwords.append({tf_feature_names[i]: topic[i] for i in tmp}) #topwords is of length #topics. topwords[i] is the top words (and scores) for topic i all_doc_fts = [] for docidx, doc in enumerate(documents): doc_ft = [] for sentidx, sent in enumerate(doc): words = sent.split(' ') topic_ft = [0] * len(topwords) #length 10 for word in words: for tp_idx, tp in enumerate(topwords): if word in tp: topic_ft[tp_idx] += 1 doc_ft += [topic_ft ] #doc_ft is num_of_sentences_in_doc x num_topics all_doc_fts += [doc_ft] finaldocft = np.array([ np.mean(np.array(all_doc_ft), 0) for all_doc_ft in all_doc_fts ]) #shape: 28 x 10 (num docs x num topics) speechinfo = [(speech['speaker'], speech['party']) for speech in speechList] top_speeches_using_topic = [] #who used topic i the most? for topicidx in range(len(topwords)): tmp = np.argsort(finaldocft[:, topicidx])[::-1][:5] top_speeches_using_topic.append( tmp) #top 3 speeches that use this topic tmp1 = [] for t in tmp: if speechinfo[t] not in tmp1: tmp1 += [speechinfo[t]] print('Topic ' + str(topicidx) + ': ' + possible_topics[topicidx]) #tmp1 = (set([speechinfo[t] for t in tmp])) print([i[0] + ' (' + i[1] + ')' for i in tmp1]) #each speech used which topics (top 3 topics per speech)? for idx, (speaker, party) in enumerate(speechinfo): tmp = np.argsort(finaldocft[idx, :])[::-1][:3] tp = ' '.join( ['Topic ' + str(i) + ' (' + possible_topics[i] + ')' for i in tmp]) print(speaker + ' used the following topics: ' + tp) #pdb.set_trace() ftmap = 0.1 * np.ones([finaldocft.shape[0], finaldocft.shape[0]]) #28 x 28 for idx1 in range(finaldocft.shape[0]): for idx2 in range(finaldocft.shape[0]): if idx1 != idx2: ftmap[idx1, idx2] = np.linalg.norm(finaldocft[idx1, :] - finaldocft[idx2, :]) sns.heatmap(np.log(ftmap)) plt.savefig('heatmap_topic.png') plt.close() #Now do metric-learning k_tr, dim_out, max_iter = 3, finaldocft.shape[1], 180 clf = LMNN(n_neighbors=k_tr, max_iter=max_iter, n_features_out=dim_out, verbose=False) class_labels = [0] * 3 + [1] * 8 + [2] * 9 + [3] * 8 clf = clf.fit(finaldocft, class_labels) #accuracy_lmnn = clf.score(finaldocft, class_labels) #print ('Metric learn accuracy: ', accuracy_lmnn) ftmap = 0.1 * np.ones([finaldocft.shape[0], finaldocft.shape[0]]) #28 x 28 for idx1 in range(finaldocft.shape[0]): for idx2 in range(finaldocft.shape[0]): if idx1 != idx2: ftmap[idx1, idx2] = np.linalg.norm( clf.transform([finaldocft[idx1, :]]) - clf.transform([finaldocft[idx2, :]])) sns.heatmap(np.log(ftmap)) plt.savefig('heatmap_topic_metric.png') plt.close()
def draw_heatmap(speechList): vocab = set([]) for idx, speech in enumerate(speechList): words_in_speech = sum(speech['text_lem'], []) words_in_speech_filt = filter(words_in_speech) vocab = vocab.union(set(words_in_speech_filt)) #vocab is a list of all unique words in all the speeches pdflist = {} try: pdflist = pkl.load(open('unigram_pdf.pkl', 'rb')) except: for idx, speech in enumerate(speechList): year = int(speech['date'].split(' ')[-1]) print (year,'xx') pdflist[year] = [speech['speaker'], get_pdf(filter(sum(speech['text_lem'],[])), vocab)] pkl.dump(pdflist, open('unigram_pdf.pkl', 'wb')) try: heatmapvals = pkl.load(open('heatmap.pkl', 'rb')) except: heatmapvals = np.zeros([len(range(1901, 2017)), len(range(1901, 2017))]) for year1 in range(1901, 2017): print(year1) for year2 in range(1901, 2017): #pdb.set_trace() try: kl1, terms1 = kldiv(pdflist[year1][-1], pdflist[year2][-1]) kl2, terms2 = kldiv(pdflist[year2][-1], pdflist[year1][-1]) heatmapvals[year1-1901][year2-1901] = 0.5*(kl1+kl2) except: continue pkl.dump(heatmapvals, open('heatmap.pkl', 'wb')) #create a (chronologically ordered) list of presidents, and their start years presis = []; presi_start_year = {} for year in range(1901, 2017): try: if pdflist[year][0] not in presis: presis += [pdflist[year][0]] presi_start_year[pdflist[year][0]] = year except: continue bigmap = helper(presis, heatmapvals, presi_start_year) #of size num_presis x num_presis np.savetxt('heatmap.csv', heatmapvals, delimiter=',') np.savetxt('heatmapbig.csv', bigmap, delimiter=',') #pdb.set_trace() #sns.heatmap(heatmapvals) sns.heatmap(bigmap) plt.savefig('heatmapbig.png') plt.close() sns.heatmap(heatmapvals[-28:, -28:]) #bush senior to obama only plt.savefig('heatmapzoom.png') plt.close() #pdb.set_trace() from sklearn.decomposition import PCA keys = pdflist[2001][1].keys() unigramft = np.array([[pdflist[yr][1][k] for k in keys] for yr in range(1989, 2017)]) #pdb.set_trace() pca = PCA(n_components=10) print ('start PCA') pca.fit(unigramft) print ('fitted PCA') newfts = pca.transform(unigramft) print ('transformed PCA') #pdb.set_trace() try: heatmapvals_zoom_pca = pkl.load(open('heatmap_zoom_pca.pkl', 'rb')) except: heatmapvals_zoom_pca = np.zeros([len(range(1989, 2017)), len(range(1989, 2017))]) for year1 in range(1989, 2017): print(year1) for year2 in range(1989, 2017): #pdb.set_trace() try: #pdb.set_trace() f1 = newfts[year1-1989,:] f2 = newfts[year2-1989,:] heatmapvals_zoom_pca[year1-1989][year2-1989] = np.linalg.norm(f1-f2) except: continue pkl.dump(heatmapvals_zoom_pca, open('heatmap_zoom_pca.pkl', 'wb')) sns.heatmap(heatmapvals_zoom_pca) #bush senior to obama only plt.savefig('heatmap_zoom_pca.png') plt.close() k_tr, dim_out, max_iter = 3, newfts.shape[1], 180 clf = LMNN(n_neighbors=k_tr, max_iter=max_iter, n_features_out=dim_out, verbose=False) class_labels = [0]*3 + [1]*8 + [2]*9 + [3]*8 clf = clf.fit(newfts, class_labels) ftmap = 0.1*np.ones([newfts.shape[0],newfts.shape[0]]) #28 x 28 for idx1 in range(newfts.shape[0]): for idx2 in range(newfts.shape[0]): if idx1!=idx2: ftmap[idx1,idx2] = np.linalg.norm(clf.transform([newfts[idx1,:]]) - clf.transform([newfts[idx2,:]])) sns.heatmap(np.log(ftmap)) plt.savefig('heatmap_pca_metric.png') plt.close()
# Mahalanobis k = 1 clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights, metric='mahalanobis', metric_params={'V': covX}) clf.fit(X_train, y_train) acc = clf.score(X_test, y_test) print(acc) mahanalobisresults.append(acc) # lmnn clf = LMNN(n_neighbors=n_neighbors, max_iter=150, n_features_out=X.shape[1]) clf.fit(X_train, y_train) acc = clf.score(X_test, y_test) print(acc) lmnnresults.append(acc) print("Euclidean k=1 std:", np.std(euclidean1results), " mean: ", np.mean(euclidean1results)) print("Euclidean k=3 std:", np.std(euclidean3results), " mean: ", np.mean(euclidean3results)) print("Mahanalobis k=1 std:", np.std(mahanalobisresults), " mean: ", np.mean(mahanalobisresults)) print("LMNN k=1 std:", np.std(lmnnresults), " mean: ", np.mean(lmnnresults))
from sklearn.model_selection import train_test_split from sklearn.datasets import load_iris from pylmnn.lmnn import LargeMarginNearestNeighbor as LMNN from pylmnn.plots import plot_comparison # Load a data set dataset = load_iris() X, y = dataset.data, dataset.target # Split in training and testing set x_tr, x_te, y_tr, y_te = train_test_split(X, y, test_size=0.7, stratify=y, random_state=42) # Set up the hyperparameters k_tr, k_te, dim_out, max_iter = 3, 1, X.shape[1], 180 # Instantiate the classifier clf = LMNN(n_neighbors=k_tr, max_iter=max_iter, n_features_out=dim_out) # Train the classifier clf = clf.fit(x_tr, y_tr) # Compute the k-nearest neighbor test accuracy after applying the learned transformation accuracy_lmnn = clf.score(x_te, y_te) print('LMNN accuracy on test set of {} points: {:.4f}'.format(x_te.shape[0], accuracy_lmnn)) # Draw a comparison plot of the test data before and after applying the learned transformation plot_comparison(clf.L_, x_te, y_te, dim_pref=3)