Example #1
from main_codes import get_data_splited
from main_codes import write_on_file_L2RFormat as wr

lables, qid, data = get_data_splited.get_data(fname="train.txt")

n_features_out = len(data[0])
print("n_features_out = ", n_features_out)

from pylmnn.lmnn import LargeMarginNearestNeighbor
# lmnn = LargeMarginNearestNeighbor(n_neighbors=1)
lmnn = LargeMarginNearestNeighbor(L=None,
                                  load=None,
                                  max_constr=10000000,
                                  max_iter=200,
                                  n_features_out=5,
                                  n_neighbors=7,
                                  random_state=1,
                                  save=None,
                                  tol=1e-05,
                                  use_pca=True,
                                  use_sparse=True,
                                  verbose=1)
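# The learned transform is fit once on the training split and then applied to
# both train and test data below; n_features_out=5 reduces the feature space
# to 5 dimensions.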
lmnn.fit(data, lables)

data = lmnn.transform(data)
wr.write_("train_transformed.txt", lables=lables, qids=qid, datas=data)

test_lables, test_qid, test_data = get_data_splited.get_data(fname="test.txt")

test_data = lmnn.transform(test_data)

wr.write_("test_transformed.txt", lables=test_lables, qids=test_qid, datas=test_data)

Example #2
# NOTE: imports reconstructed for this snippet. kNN, kNN_classification,
# linRegress, logRegress, randomForestRegress, randomForestClass, lassoRegress,
# ridgeRegress, metricLearnRegression, binDataFunc, plotData and
# plotScaledConfusionMatrix are project-local helpers whose imports the
# original snippet omits.
import argparse
import os
import shutil
import sys
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pylab
from scipy import stats
from astropy.io import fits
from sklearn import metrics
from sklearn.metrics import confusion_matrix, adjusted_mutual_info_score
from sklearn.model_selection import KFold
from pylmnn.lmnn import LargeMarginNearestNeighbor as LMNN

import functions  # project-local preprocessing module


def main():
    # NOTE: the catalogue is opened before argument parsing and before any
    # chdir; "deep.fits" is hard-coded here even though --catType selects a
    # catalogue name further down (quirk preserved from the original script).
    hdul = fits.open("deep.fits")
    hdulData = hdul[1].data

    print(hdulData)
    parser = argparse.ArgumentParser(description="This script runs kNN with the required parameters. You should look at those.")
    parser.add_argument("-c", "--catType", nargs=1, required=True, type=int, help="0 for DEEP, 1 for WIDE - Catalogue Type, 2 for complete dataset, including missing values")
    parser.add_argument("-f", "--fillMethod", nargs=1, required=False, type=int, help="0 for replacement of missing values with the column mean. 1 for column median.") 
    parser.add_argument("-t", "--testType", nargs=1, required=True, type=int, help="0 for normal, 1 for sub-field test with train = ELAIS-S1, 2 = sub-field test with train = eCDFS") 
    parser.add_argument("-C", "--colsType", nargs=1, required=True, type=int, help="0 for radio, sIndex, 3.6, 4.5, 5.8, 8.0, g, r, i, z. 1 for radio, 3.6, 4.5, 5.8, 8.0, g, r, i, z. 2 for radio, 3.6, 4.5, g, r, i, z")
    parser.add_argument("-d", "--distType", nargs=1, required=True, type=int, help="1 for Manhattan, 2 for Euclidean, 99 for Mahalanobis") 
    parser.add_argument("-b", "--bootstrapSize", nargs=1, required=False, type=int, help="Number of bootstrap intervals. Do not use if you don't want bootstrap") 
    parser.add_argument("-p", "--preBin", nargs=1, required=False, type=bool, help="Should the data be pre-binned? Don't enter for no") 
    parser.add_argument("-P", "--postBin", nargs=1, required=False, type=bool, help="Should the data be post-binned? Don't enter for no") 
    parser.add_argument("-z", "--classification", nargs=1, required=False, type=bool, help="Classification or regression. True for classification. Don't enter for no")
    parser.add_argument("-m", "--metricLearn", nargs=1, required=False, type=bool, help="Should metric learning be used? Don't enter for no")
    parser.add_argument("-M", "--method", nargs=1, required=False, type=int, help="The type of ML to use. Nothing for kNN, 1 Linear Regression, 2 for Random Forest, 3 for lasso regression, 4 for ridge regression") 
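    # NOTE: argparse's type=bool treats any non-empty string as True, so the
    # boolean flags above are effectively "present means True".
    # Hypothetical invocation (script name assumed):
    #   python run_knn.py -c 0 -t 0 -C 1 -d 2 -b 100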


    args = vars(parser.parse_args())

    

    #Data, Tests and Columns to use
    catType = args["catType"][0]
    testType = args["testType"][0]
    colsType = args["colsType"][0]
    distType = args["distType"][0]
    FailureLimit = 0.15
    nSplits = 10 #Used in k-Fold Cross Validation. 

    fillMethod = 0 if args["fillMethod"] is None else args["fillMethod"][0]
    bootstrapSize = False if args["bootstrapSize"] is None else args["bootstrapSize"][0]
    binData = 15
    postBin = False if args["postBin"] is None else args["postBin"][0]
    preBin = False if args["preBin"] is None else args["preBin"][0]
    classification = False if args["classification"] is None else args["classification"][0]
    metricLearn = False if args["metricLearn"] is None else args["metricLearn"][0]
    MLMethod = 0 if args["method"] is None else args["method"][0]
    if MLMethod == 2:
        neighboursList = range(2,60) #Using Random Forest. Should be different!
    elif MLMethod == 0:
        neighboursList = range(2,20) 
    else:
        neighboursList = [0] 

    folderpath = "cat-" + str(catType).strip() + "_fillMethod-" + str(fillMethod).strip() + "_testType-" + str(testType).strip() + "_colsType-" + str(colsType).strip() + "_distType-" + str(distType).strip()
    folderpath = folderpath + "_boot-" + str(bootstrapSize).strip() + "_preBin-" + str(preBin).strip() + "_postBin-" + str(postBin).strip() 
    folderpath = folderpath + "_class-" + str(classification).strip() + "_metricLearn-" + str(metricLearn).strip() + "_MLMethod-" + str(MLMethod)

    if not os.path.exists("Results"):
        os.makedirs("Results")
    os.chdir("Results")
    startTime = datetime.now()
    # if not os.path.exists(now.strftime("%d-%m-%Y")):
    #     os.makedirs(now.strftime("%d-%m-%Y"))
    # os.chdir(now.strftime("%d-%m-%Y"))
    if not os.path.exists(folderpath):
        os.makedirs(folderpath) 



    #Individual switches to modify "small" details
    logZ = False #Set to True if z should be log(z)
    #NOTE!!! There is no check on the below redshift modifications to make sure selections make sense!
    maxRedshift = None #Flag to set max redshift to keep
    minRedshift = None #Flag to set min redshift to keep
    np.random.seed(42) 

    useLogRadio = False #Set to true to take the log of radio data. False otherwise
    useColoursOptical = False #Set to True to use Optical Colours instead of Magnitudes
    useIRMagnitudes = False #Set to True to use log(IRFlux)
    useColoursIR = False #Set to True to use IR Colours instead of Fluxes (Implies True to above)
    standardiseXVals = True #Set to True to standardise the X-values: (x_i - x_mean) / x_sd

    if catType == 0:
        # catalogue = "../ATLAS_CATALOGUE/ATLAS-SWIRE-GRC-merged-2017-11-07.fits"
        catalogue = "deep.fits"
        # catalogue = "/bigdata/users/postgrad/kluken/Masters/ATLAS_CATALOGUE/ATLAS_Reduced_NoSindex.fits"
    elif catType == 1:
        catalogue = "wide.fits"
        #catalogue = "/bigdata/users/postgrad/kluken/Masters/ATLAS_CATALOGUE/ATLAS_EMU_3.6_4.5_DES.fits"
    elif catType == 2:
        catalogue = "missing.fits"



    if colsType == 0:
        dataCols = ["z","Sp2","Sindex","flux_ap2_36","flux_ap2_45","flux_ap2_58","flux_ap2_80","MAG_APER_4_G","MAG_APER_4_R","MAG_APER_4_I","MAG_APER_4_Z"]
        dataType = [0,1,2,3,3,3,3,4,4,4,4]
    elif colsType == 1:
        dataCols = ["z","Sp2","flux_ap2_36","flux_ap2_45","flux_ap2_58","flux_ap2_80","MAG_APER_4_G","MAG_APER_4_R","MAG_APER_4_I","MAG_APER_4_Z"]
        dataType = [0,1,3,3,3,3,4,4,4,4]
    elif colsType == 2:
        dataCols = ["z","Sp2","flux_ap2_36","flux_ap2_45","MAG_APER_4_G","MAG_APER_4_R","MAG_APER_4_I","MAG_APER_4_Z"]
        dataType = [0,1,3,3,4,4,4,4]



    #Create instance of preprocessing class to clean data
    preprocess = functions.DataProcessing()


    #Open Fits Catalogue
    print(catalogue)
    print(folderpath)


    # hdul = fits.open("missing.fits")
    # hdulData = hdul[1].data
    # print("Time taken to open fitsFile: " + str(datetime.now() - startTime))
    os.chdir(folderpath)


    if os.path.isfile("resultsPlot.pdf"):
        print(folderpath + " already complete")
        sys.exit()


    #Create catalogueData array from the redshift column
    catalogueData = np.reshape(np.array(hdulData.field(dataCols[0]), dtype=np.float32), [len(hdulData.field(dataCols[0])),1])
    #Add the columns required for the test
    for i in range(1, len(dataCols)):
        catalogueData = np.hstack([catalogueData,np.reshape(np.array(hdulData.field(dataCols[i]), dtype=np.float32), [len(hdulData.field(dataCols[i])),1])])
    fieldList = np.reshape(np.array(hdulData.field("field"), dtype=str), [len(hdulData.field("field")),1])
    # print("Time taken to create catalogueData: " + str(datetime.now() - startTime))

    catalogueData1 = pd.DataFrame(catalogueData)
    catalogueData1.to_csv("catalogueData1.csv", index=False)
    #Begin cleaning process
    #Remove items with missing redshifts
    missingRedshifts = np.where(catalogueData[:,0] <= 0)[0]
    cleanCatalogue = np.delete(catalogueData, missingRedshifts, 0)
    fieldList = np.delete(fieldList, missingRedshifts, 0)

    #Make sure values are all within "sane" ranges
    for i in range(1, len(dataCols)):
        cleanCatalogue[:,i] = preprocess.cleanData(cleanCatalogue[:,i], dataType[i], fillMethod)

    cleanCatalogue1 = pd.DataFrame(cleanCatalogue)
    cleanCatalogue1.to_csv("cleanCatalogue1.csv", index=False)


    #print("updated Categologe", cleanCatalogue.shape)
    #Removing min and max redshifts if set
    if minRedshift is not None:
        killRedshift = np.where(cleanCatalogue[:,0] < minRedshift)[0]
        cleanCatalogue = np.delete(cleanCatalogue, killRedshift, 0)
    if maxRedshift is not None:
        killRedshift = np.where(cleanCatalogue[:,0] > maxRedshift)[0]
        cleanCatalogue = np.delete(cleanCatalogue, killRedshift, 0)
    
    if postBin and not classification:
        temp = cleanCatalogue[:,0]
        temp, binEdges, binnedZ = binDataFunc(temp, binData)
    
    #Bin z values
    if preBin or classification:
        cleanCatalogue[:,0], binEdges, binnedZ = binDataFunc(cleanCatalogue[:,0], binData)
    
    #Take log(z)
    if logZ:
        cleanCatalogue[:,0] = np.log(cleanCatalogue[:,0])
    
    #Use log(Radio)
    if useLogRadio:
        cleanCatalogue[:,1] = np.log(cleanCatalogue[:,1])

    #Use Optical Colours
    if useColoursOptical:
        for i in range(-4, -2):
            cleanCatalogue[:,i] = cleanCatalogue[:,i] - cleanCatalogue[:,i+1]
        cleanCatalogue = cleanCatalogue[:,0:-1]

    #Use IR Colours. Each dataType has different column numbers for IR, hence need different solution to each.
    #Need to take log(IR Flux) to get them into "Magnitudes", which can then be used to calculate the difference
    #between each - "Colours". Given we lose a column of data going to colours, delete the last column.
    #Set useMagnitudes to False so we don't then take a log of a ratio of a log.
    if useColoursIR:
        if colsType == 0:
            for i in range(4,6):
                cleanCatalogue[:,i] = np.log(cleanCatalogue[:,i])
            for i in range(4,5):
                cleanCatalogue[:,i] = cleanCatalogue[:,i] - cleanCatalogue[:,i+1]
            cleanCatalogue = np.delete(cleanCatalogue, obj = 6, axis = 1)
        elif colsType == 1:
            for i in range(3,5):
                cleanCatalogue[:,i] = np.log(cleanCatalogue[:,i])
            for i in range(3,4):
                cleanCatalogue[:,i] = cleanCatalogue[:,i] - cleanCatalogue[:,i+1]
            cleanCatalogue = np.delete(cleanCatalogue, obj = 5, axis = 1)
        elif colsType == 2:
            for i in range(3,4):
                cleanCatalogue[:,i] = np.log(cleanCatalogue[:,i])
            cleanCatalogue[:,2] = cleanCatalogue[:,2] - cleanCatalogue[:,3]
            cleanCatalogue = np.delete(cleanCatalogue, obj = 3, axis = 1)
        useIRMagnitudes = False

    #Take the log(IR Flux) to get the IR "Magnitudes"
    if useIRMagnitudes:
        if colsType == 0:
            for i in range(4,6):
                cleanCatalogue[:,i] = np.log(cleanCatalogue[:,i])
        elif colsType == 1:
            for i in range(3,5):
                cleanCatalogue[:,i] = np.log(cleanCatalogue[:,i])
        elif colsType == 2:
            for i in range(3,4):
                cleanCatalogue[:,i] = np.log(cleanCatalogue[:,i])


    cleanCatalogue1 = pd.DataFrame(cleanCatalogue)
    cleanCatalogue1.to_csv("cleanCatalogue1.csv", index=False)
    print("cleaning done")

    #Standardising all xVals - (x_i - x_mean) / x_sd
    if standardiseXVals:
        for i in range(1, cleanCatalogue.shape[1]):
            cleanCatalogue[:,i] = (cleanCatalogue[:,i] - np.mean(cleanCatalogue[:,i])) / np.std(cleanCatalogue[:,i])

    # print("Time taken to clean and pre-process cleanCatalogue: " + str(datetime.now() - startTime))


    y_vals = cleanCatalogue[:,[0]]
    x_vals = cleanCatalogue[:,1:]
    num_features = x_vals.shape[1]
    predictionBootstrap = []
    mseBootstrap = []
    outlierBootstrap = []
    
    # Split the data into train and test sets
    if testType == 0: 
        #Withdraw our 30% test set
        np.random.seed(225)
        test_indices = np.random.choice(len(x_vals), round(len(x_vals)*0.3), replace=False)
        train_indices = np.array(list(set(range(len(x_vals))) - set(test_indices)))
        x_vals_train = x_vals[train_indices]
        x_vals_test = x_vals[test_indices]
        y_vals_train = y_vals[train_indices]
        y_vals_test = y_vals[test_indices]

    elif testType == 1:
        #Withdraw our test set
        x_vals_test = x_vals[np.where(fieldList == "CDFS    ")[0]]
        y_vals_test = y_vals[np.where(fieldList == "CDFS    ")[0]]
        #Find the training set
        x_vals_train = x_vals[np.where(fieldList == "ELAIS-S1")[0]]
        y_vals_train = y_vals[np.where(fieldList == "ELAIS-S1")[0]]
        
        
    elif testType == 2:
        #Withdraw our test set
        y_vals_test = y_vals[np.where(fieldList == "ELAIS-S1")[0]]
        x_vals_test = x_vals[np.where(fieldList == "ELAIS-S1")[0]]
        #Find the training set
        x_vals_train = x_vals[np.where(fieldList == "CDFS    ")[0]]
        y_vals_train = y_vals[np.where(fieldList == "CDFS    ")[0]]

    print("start")
    x_vals_train_df = pd.DataFrame(x_vals_train)
    y_vals_train_df = pd.DataFrame(y_vals_train)
    vals_train_df = pd.concat([y_vals_train_df, x_vals_train_df], ignore_index=True, axis=1)
    vals_train_df.to_csv('vals_train_df_test_type{dbname}.csv'.format(dbname=testType), index=False)


    print("train size", vals_train_df.shape)
    x_vals_test_df = pd.DataFrame(x_vals_test)
    y_vals_test_df = pd.DataFrame(y_vals_test)
    vals_test_df = pd.concat([y_vals_test_df, x_vals_test_df], ignore_index=True, axis=1)
    vals_test_df.to_csv('vals_test_df_test_type{dbname}.csv'.format(dbname=testType), index=False)
    print("test size", vals_test_df.shape)

    method_no = 6

    if method_no == 1:
        imputed_file_name = "KNN_imputated_catalogueData1.csv"
    elif method_no == 2:
        imputed_file_name = "GAN_imputated_catalogueData1.csv"
    elif method_no == 3:
        imputed_file_name = "Mean_imputated_catalogueData1.csv"
    elif method_no == 4:
        imputed_file_name = "Median_imputated_catalogueData1.csv"
    elif method_no == 5:
        imputed_file_name = "MICE_imputated_catalogueData1.csv"
    else:
        imputed_file_name = "None"

    if imputed_file_name != "None":
        if not os.path.isfile(imputed_file_name):
            sourceFolderPath = "Source File Path/Results/"
            destFolderPath = sourceFolderPath + folderpath + "/"
            shutil.move(os.path.join(sourceFolderPath, imputed_file_name), destFolderPath)

        x_vals_test = np.loadtxt(imputed_file_name, delimiter=",", usecols=(range(1, 10)), skiprows=1)
        y_vals_test = np.loadtxt(imputed_file_name, delimiter=",", usecols=(0), skiprows=1)
        y_vals_test = y_vals_test.reshape(-1)
        print("x size is:", x_vals_test.shape)
        print("y size is: ", y_vals_test.shape)
        print("done")

    if not preBin and metricLearn:
        metricLearnModel = metricLearnRegression(x_vals_train, y_vals_train)
        x_vals_train = metricLearnModel.transform(x_vals_train)
        x_vals_test = metricLearnModel.transform(x_vals_test)


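    # bootstrapSize is False when bootstrap was not requested and an int count
    # otherwise; type(...) == int (rather than isinstance) deliberately
    # excludes bool, since bool is a subclass of int in Python.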
    if type(bootstrapSize) == int:
        predictionBootstrap = []
        mseBootstrap = []
        outlierBootstrap = []

        # for i in tqdm(range(bootstrapSize)):
        for i in range(bootstrapSize):
            # Use metric learning if required
            if metricLearn and not classification:
                B = metricLearnRegression(x_vals_train, y_vals_train)
                x_vals_train = B.transform(x_vals_train)
                x_vals_test = B.transform(x_vals_test)

            # Split the data into train and test sets
            # Randomly sample our training set for bootstrapping
            train_indices = np.random.choice(len(y_vals_train), len(y_vals_train), replace=True)
            x_vals_train_bootstrap = x_vals_train[train_indices]
            y_vals_train_bootstrap = y_vals_train[train_indices]

            kFold = KFold(n_splits=nSplits, random_state=10, shuffle=True)
            MSE = []
            Failed = []

            # for numNeighbours in tqdm(neighboursList):
            for numNeighbours in neighboursList:
                mseList = []
                failed = []

                # for trainIndex, testIndex in tqdm(kFold.split(x_vals_train_bootstrap), total=nSplits):
                for trainIndex, testIndex in kFold.split(x_vals_train_bootstrap):
                    x_vals_train_cross = x_vals_train_bootstrap[trainIndex]
                    x_vals_test_cross = x_vals_train_bootstrap[testIndex]
                    y_vals_train_cross = y_vals_train_bootstrap[trainIndex]
                    y_vals_test_cross = y_vals_train_bootstrap[testIndex]

                    if MLMethod == 0:
                        pred, mseTest = kNN(numNeighbours, x_vals_train_cross, x_vals_test_cross, y_vals_train_cross, y_vals_test_cross, distType)
                    elif MLMethod == 1:
                        pred, mseTest = linRegress(x_vals_train_cross, x_vals_test_cross, y_vals_train_cross, y_vals_test_cross)
                    elif MLMethod == 2:
                        pred, mseTest = randomForestRegress(numNeighbours, x_vals_train_cross, x_vals_test_cross, y_vals_train_cross, y_vals_test_cross)
                    elif MLMethod == 3:
                        pred, mseTest = lassoRegress(x_vals_train_cross, x_vals_test_cross, y_vals_train_cross, y_vals_test_cross)
                    elif MLMethod == 4:
                        pred, mseTest = ridgeRegress(x_vals_train_cross, x_vals_test_cross, y_vals_train_cross, y_vals_test_cross)

                    lengthOfSplit = len(pred)
                    if logZ:
                        error = np.abs(np.exp(pred) - np.exp(y_vals_test_cross))
                        failed.append(len(error[np.where(error > (FailureLimit * (1+np.exp(y_vals_test_cross))))[0]])/lengthOfSplit )
                    else:
                        error = np.abs(pred - y_vals_test_cross)
                        failed.append(len(error[np.where(error > (FailureLimit * (1+y_vals_test_cross)))[0]])/lengthOfSplit )

                    mseList.append(np.round(mseTest,3))

                MSE.append(np.mean(mseList))
                Failed.append(np.mean(failed))
            
            mseBootstrap.append(MSE)
            outlierBootstrap.append(Failed)

            bestKIndex = (np.argmin(np.array(Failed)))
            bestK = neighboursList[bestKIndex]

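            # Refit on the full bootstrap sample with the chosen k and score on
            # the held-out test set.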
            if MLMethod == 0:
                pred, mse_test = kNN(bestK, x_vals_train_bootstrap, x_vals_test, y_vals_train_bootstrap, y_vals_test, distType)
            elif MLMethod == 1:
                pred, mse_test = linRegress(x_vals_train_bootstrap, x_vals_test, y_vals_train_bootstrap, y_vals_test)
            elif MLMethod == 2:
                pred, mse_test = randomForestRegress(bestK, x_vals_train_bootstrap, x_vals_test, y_vals_train_bootstrap, y_vals_test)
            elif MLMethod == 3:
                pred, mse_test = lassoRegress(x_vals_train_bootstrap, x_vals_test, y_vals_train_bootstrap, y_vals_test)
            elif MLMethod == 4:
                pred, mse_test = ridgeRegress(x_vals_train_bootstrap, x_vals_test, y_vals_train_bootstrap, y_vals_test)
            
            if logZ:
                error = np.abs(np.exp(pred) - np.exp(y_vals_test))
                testError = (len(error[np.where(error > (FailureLimit*(1+np.exp(y_vals_test))))[0]])/len(pred) )
            else:
                error = np.abs(pred - y_vals_test)
                testError = (len(error[np.where(error > (FailureLimit*(1+y_vals_test)))[0]])/len(pred) )


            if logZ:
                predictionBootstrap.append(np.exp(pred))
            else:
                predictionBootstrap.append(pred)

    outlier_final = []
    mse_final = []
    kFold = KFold(n_splits=nSplits, random_state=10, shuffle=True)
    # for numNeighbours in tqdm(neighboursList):
    for numNeighbours in neighboursList:
        mseList = []
        failed = []
        # TODO: Need to turn this back on when regression metric learn is working.
        if metricLearn and preBin: # and classification:
            lmnn = LMNN(n_neighbors=numNeighbours, max_iter=200, n_features_out=x_vals_train.shape[1], verbose=None)
        
        # for trainIndex, testIndex in tqdm(kFold.split(x_vals_train), total=nSplits):
        for trainIndex, testIndex in kFold.split(x_vals_train):
            #Define training and test sets
            x_vals_train_cross = x_vals_train[trainIndex]
            x_vals_test_cross = x_vals_train[testIndex]
            y_vals_train_cross = y_vals_train[trainIndex]
            y_vals_test_cross = y_vals_train[testIndex]

            # TODO: Need to turn this back on when regression metric learn is working.
            if metricLearn and preBin: # and classification:
                lmnn.fit(x_vals_train_cross, np.squeeze(y_vals_train_cross.astype(str)))
                x_vals_train_cross = lmnn.transform(x_vals_train_cross)
                x_vals_test_cross = lmnn.transform(x_vals_test_cross)
            
            # Use metric learning if required
            if metricLearn and not classification:
                B = metricLearnRegression(x_vals_train_cross, y_vals_train_cross)
                x_vals_train_cross = B.transform(x_vals_train_cross)
                x_vals_test_cross = B.transform(x_vals_test_cross)

            if classification:
                if MLMethod == 0:
                    pred, mseTest = kNN_classification(numNeighbours, x_vals_train_cross, x_vals_test_cross, y_vals_train_cross, y_vals_test_cross, distType)
                elif MLMethod == 1:
                    pred, mseTest = logRegress(x_vals_train_cross, x_vals_test_cross, y_vals_train_cross, y_vals_test_cross)
                elif MLMethod == 2:
                    pred, mseTest = randomForestClass(numNeighbours, x_vals_train_cross, x_vals_test_cross, y_vals_train_cross, y_vals_test_cross)
            else:
                if MLMethod == 0:
                    pred, mseTest = kNN(numNeighbours, x_vals_train_cross, x_vals_test_cross, y_vals_train_cross, y_vals_test_cross, distType)
                elif MLMethod == 1:
                    pred, mseTest = linRegress(x_vals_train_cross, x_vals_test_cross, y_vals_train_cross, y_vals_test_cross)
                elif MLMethod == 2:
                    pred, mseTest = randomForestRegress(numNeighbours, x_vals_train_cross, x_vals_test_cross, y_vals_train_cross, y_vals_test_cross)
                elif MLMethod == 3:
                    pred, mseTest = lassoRegress(x_vals_train_cross, x_vals_test_cross, y_vals_train_cross, y_vals_test_cross)
                elif MLMethod == 4:
                    pred, mseTest = ridgeRegress(x_vals_train_cross, x_vals_test_cross, y_vals_train_cross, y_vals_test_cross)
                
            lengthOfSplit = len(pred)
            if logZ:
                error = np.abs(np.exp(pred) - np.exp(np.squeeze(y_vals_test_cross)))
                failed.append(len(error[np.where(error > (FailureLimit * (1+np.exp(np.squeeze(y_vals_test_cross)))))[0]])/lengthOfSplit )
            else:
                error = np.abs(pred - np.squeeze(y_vals_test_cross))
                failed.append(len(error[np.where(error > (FailureLimit * (1+np.squeeze(y_vals_test_cross))))[0]])/lengthOfSplit )
        
            mseList.append(np.round(mseTest,3))

        mse_final.append(np.mean(mseList))
        outlier_final.append(np.mean(failed))

    bestKIndex = (np.argmin(np.array(outlier_final)))
    bestK = neighboursList[bestKIndex]

    if classification:
        if metricLearn:
            lmnn = LMNN(n_neighbors=bestK, max_iter=200, n_features_out=x_vals_train.shape[1], verbose=None)
            lmnn.fit(x_vals_train, np.squeeze(y_vals_train.astype(str)))
            x_vals_train = lmnn.transform(x_vals_train)
            x_vals_test = lmnn.transform(x_vals_test)
            
        if MLMethod == 0:
            finalPrediction, finalMSE = kNN_classification(bestK, x_vals_train, x_vals_test, y_vals_train, y_vals_test, distType)
        if MLMethod == 1:
            finalPrediction, finalMSE = logRegress(x_vals_train, x_vals_test, y_vals_train, y_vals_test)
        if MLMethod == 2:
            finalPrediction, finalMSE = randomForestClass(bestK, x_vals_train, x_vals_test, y_vals_train, y_vals_test)

        
    else:
        # TODO: Need to remove/change this once regression metric learning is done.
        if metricLearn and preBin: 
            lmnn = LMNN(n_neighbors=bestK, max_iter=200, n_features_out=x_vals_train.shape[1], verbose=None)
            lmnn.fit(x_vals_train, np.squeeze(y_vals_train.astype(str)))
            x_vals_train = lmnn.transform(x_vals_train)
            x_vals_test = lmnn.transform(x_vals_test)

        if MLMethod == 0:
            finalPrediction, finalMSE = kNN(bestK, x_vals_train, x_vals_test, y_vals_train, y_vals_test, distType)
        elif MLMethod == 1:
            finalPrediction, finalMSE = linRegress(x_vals_train, x_vals_test, y_vals_train, y_vals_test)
        elif MLMethod == 2:
            finalPrediction, finalMSE = randomForestRegress(bestK, x_vals_train, x_vals_test, y_vals_train, y_vals_test)
        elif MLMethod == 3:
            finalPrediction, finalMSE = lassoRegress(x_vals_train, x_vals_test, y_vals_train, y_vals_test)
        elif MLMethod == 4:
            finalPrediction, finalMSE = ridgeRegress(x_vals_train, x_vals_test, y_vals_train, y_vals_test)

    residuals = (np.squeeze(y_vals_test) - finalPrediction) / (1 + np.squeeze(y_vals_test))

    if postBin:
        finalPrediction, temp, temp2 = binDataFunc(finalPrediction, binData, binEdges = binEdges, newZ = binnedZ)
        y_vals_test, temp, temp2 = binDataFunc(y_vals_test, binData, binEdges = binEdges, newZ = binnedZ)
        
    if postBin or classification:
        confusion = confusion_matrix(np.round(y_vals_test,2).astype(str), np.round(finalPrediction,2).astype(str))
        #plotNormConfusionMatrix(confusion,binnedZ,binEdges)
        plotScaledConfusionMatrix(y_vals_test, finalPrediction, binEdges, binnedZ)
        mutualInfo = adjusted_mutual_info_score(np.squeeze(y_vals_test).astype(str),np.squeeze(finalPrediction).astype(str))



    if logZ:
        error = np.abs(np.exp(finalPrediction) - np.exp(np.squeeze(y_vals_test)))
        testError = (len(error[np.where(error > (FailureLimit*(1+np.exp(np.squeeze(y_vals_test)))))[0]])/len(finalPrediction) )
    else:
        error = np.abs(finalPrediction - np.squeeze(y_vals_test))
        testError = (len(error[np.where(error > (FailureLimit*(1+np.squeeze(y_vals_test))))[0]])/len(finalPrediction) )




    if type(bootstrapSize) == int:
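        # Turn the 2.5th/97.5th bootstrap percentiles into asymmetric error-bar
        # offsets (distance below/above the point prediction) for plotting.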
        percentiles = np.percentile(predictionBootstrap, q=[2.5,97.5], axis=0)
        percentiles[0,:] = np.abs(finalPrediction -  percentiles[0,:])
        percentiles[1,:] = np.abs(percentiles[1,:] - finalPrediction)



    if classification or postBin:
        precision = metrics.precision_score(y_vals_test.ravel().astype(str), finalPrediction.ravel().astype(str), average="macro")
        recall = metrics.recall_score(y_vals_test.ravel().astype(str), finalPrediction.ravel().astype(str), average="macro")
        f1 = metrics.f1_score(y_vals_test.ravel().astype(str), finalPrediction.ravel().astype(str), average="macro")
    else:
        mse = metrics.mean_squared_error(y_vals_test.ravel(), finalPrediction.ravel())

    predFile = "finalPredictions"
    yValsFile = "yValsFile"
    mseFile = "mseFile"
    outlierFile = "outlierFile"
    binEdgesFile = "binEdges"

    with open(predFile, "wb") as openFile:
        pickle.dump(finalPrediction, openFile)

    with open(yValsFile, "wb") as openFile:
        pickle.dump(y_vals_test, openFile)

    with open(mseFile, "wb") as openFile:
        pickle.dump(mse_final, openFile)

    with open(outlierFile, "wb") as openFile:
        pickle.dump(outlier_final, openFile)

    if postBin or classification:
        with open(binEdgesFile, "wb") as openFile:
            pickle.dump(binEdges, openFile)
    
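    # Outlier rate (%) under the common photo-z criterion
    # |z_pred - z_spec| / (1 + z_spec) > 0.15.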
    outlierRate = 100*len(residuals[np.where(abs(residuals)>0.15)])/len(residuals)

    with open("results.csv", "w") as openFile:
        if postBin or classification:
            openFile.write("bestK,numTrainSources,numTestSources,outlier,score,mutualInfo,residual_std_dev,precision,recall,f1,time\n")
            openFile.write(",".join(str(v) for v in [bestK, y_vals_train.shape[0], y_vals_test.shape[0], outlierRate, finalMSE, mutualInfo, np.std(residuals), precision, recall, f1, datetime.now() - startTime]))
        else:
            openFile.write("bestK,numTrainSources,numTestSources,outlier,score,residual_std_dev,mse,time\n")
            openFile.write(",".join(str(v) for v in [bestK, y_vals_train.shape[0], y_vals_test.shape[0], outlierRate, finalMSE, np.std(residuals), mse, datetime.now() - startTime]))

    #Find number of test sources to use in the plot titles

    if MLMethod not in (1, 3, 4):
        plt.figure(0)
        if classification:
            plt.plot(neighboursList, np.array(mse_final), color="springgreen", label="Accuracy")
        else:
            plt.plot(neighboursList, np.array(mse_final), color="springgreen", label=r'R$^2$')
        plt.plot(neighboursList, np.array(outlier_final), color="deepskyblue", label="Failure Rate")
        plt.ylabel("Error Metric")
        if MLMethod == 0:
            plt.xlabel('Number of Neighbours')
        elif MLMethod == 2:
            plt.xlabel("Number of Trees")
        plt.axvline(bestK,color="red", alpha=0.5)
        plt.legend()
        plt.tight_layout()
        plt.grid()
        plt.savefig("cross_validation.pdf")



    if logZ:
        y_vals_test = np.exp(y_vals_test)



    if type(bootstrapSize) == bool:
        plotData(np.squeeze(y_vals_test), finalPrediction, plt, stats, pylab)
    else:
        plotData(np.squeeze(y_vals_test), finalPrediction, plt, stats, pylab, percentiles)


    plt.savefig("resultsPlot.pdf")


if __name__ == "__main__":
    main()
Example #3
# NOTE: this snippet begins mid-statement; the imports and the split call
# below are a plausible reconstruction (X, y and n_neighbors are assumed to
# be defined by earlier code, e.g. a 2-feature dataset and a neighbour count).
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from pylmnn.lmnn import LargeMarginNearestNeighbor as LMNN

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.33,
                                                    random_state=19)
covX = np.cov(X_train, rowvar=False)

h = .02  # step size in the mesh

# Create color maps
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

print('done')

for weights in ['uniform']:
    # we create an instance of Neighbours Classifier and fit the data.
    clf = LMNN(n_neighbors=n_neighbors,
               max_iter=150,
               n_features_out=X.shape[1])
    clf.fit(X_train, y_train)

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
    y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
Example #4
X = [[0, 3], [1, 2], [2, 4], [3, 1.5]]
y = [0, 0, 1, 1]
from pylmnn.lmnn import LargeMarginNearestNeighbor
# lmnn = LargeMarginNearestNeighbor(n_neighbors=1)
lmnn = LargeMarginNearestNeighbor(L=None,
                                  load=None,
                                  max_constr=10000000,
                                  max_iter=200,
                                  n_features_out=1,
                                  n_neighbors=1,
                                  random_state=None,
                                  save=None,
                                  tol=1e-05,
                                  use_pca=True,
                                  use_sparse=True,
                                  verbose=1)
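# With n_features_out=1 the learned transform projects the 2-D points onto a
# single dimension, so transform() returns one column per sample.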
lmnn.fit(X, y)
print(lmnn.transform(X))

test = [[1.6, 1.6]]
print(lmnn.predict(test))

print(lmnn.transform(test))
Example #5
# NOTE: imports reconstructed for this snippet; display_topics is assumed to
# be a project-local helper.
import re
import string

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from pylmnn.lmnn import LargeMarginNearestNeighbor as LMNN


def find_topic(speechList):
    speechList = speechList[-28:]  #from bush senior

    punct_re = re.compile('[%s]' % re.escape(string.punctuation))
    documents = []
    for speech in speechList:
        tmp = []
        for sent in speech['text_lem']:  # sent is a single sentence: a list of words
            ss = []
            for w in sent:  # w is a single word
                w = punct_re.sub('', w)  # strip punctuation from the word
                if len(w) > 2:
                    try:
                        float(w)  # skip numbers
                    except ValueError:
                        ss += [w]
            # tmp is a list of sentences; each sentence is a space-separated string
            tmp += [' '.join(ss)]
        documents += [tmp]
    documents_sents = sum(documents, [])  # list of sentences across all speeches

    print(len(documents_sents))

    tf_vectorizer = CountVectorizer(max_df=0.95,
                                    min_df=2,
                                    max_features=2000,
                                    stop_words='english')
    tf = tf_vectorizer.fit_transform(documents_sents)
    # get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
    tf_feature_names = tf_vectorizer.get_feature_names_out()

    num_topics = 10
    # n_topics was renamed to n_components in scikit-learn 0.19
    lda = LatentDirichletAllocation(n_components=num_topics,
                                    max_iter=10,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0).fit(tf)

    possible_topics = [
        'education', 'jobs', 'world affairs', 'health care', 'middle east',
        'terrorism', 'taxation', 'social programs', 'law and order', 'iraq war'
    ]  #from later analysis

    display_topics(lda, tf_feature_names, 15, possible_topics)

    #now that we have the topic model, let us see, how each speech fares:
    topwords = []
    for topic_idx, topic in enumerate(lda.components_):
        #topic is a np array of 2000 (num features) numbers
        tmp = topic.argsort()[:-15 - 1:-1]
        topwords.append({tf_feature_names[i]: topic[i] for i in tmp})
    #topwords is of length #topics. topwords[i] is the top words (and scores) for topic i

    all_doc_fts = []
    for docidx, doc in enumerate(documents):
        doc_ft = []
        for sentidx, sent in enumerate(doc):
            words = sent.split(' ')
            topic_ft = [0] * len(topwords)  #length 10
            for word in words:
                for tp_idx, tp in enumerate(topwords):
                    if word in tp:
                        topic_ft[tp_idx] += 1
            doc_ft += [topic_ft
                       ]  #doc_ft is num_of_sentences_in_doc x num_topics
        all_doc_fts += [doc_ft]

    finaldocft = np.array([
        np.mean(np.array(all_doc_ft), 0) for all_doc_ft in all_doc_fts
    ])  #shape: 28 x 10 (num docs x num topics)

    speechinfo = [(speech['speaker'], speech['party'])
                  for speech in speechList]
    top_speeches_using_topic = []
    #who used topic i the most?
    for topicidx in range(len(topwords)):
        tmp = np.argsort(finaldocft[:, topicidx])[::-1][:5]
        top_speeches_using_topic.append(
            tmp)  #top 3 speeches that use this topic
        tmp1 = []
        for t in tmp:
            if speechinfo[t] not in tmp1:
                tmp1 += [speechinfo[t]]
        print('Topic ' + str(topicidx) + ': ' + possible_topics[topicidx])
        #tmp1 = (set([speechinfo[t] for t in tmp]))
        print([i[0] + ' (' + i[1] + ')' for i in tmp1])

    #each speech used which topics (top 3 topics per speech)?
    for idx, (speaker, party) in enumerate(speechinfo):
        tmp = np.argsort(finaldocft[idx, :])[::-1][:3]
        tp = ' '.join(
            ['Topic ' + str(i) + ' (' + possible_topics[i] + ')' for i in tmp])
        print(speaker + ' used the following topics: ' + tp)

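    # Pairwise Euclidean distances between the speeches' topic profiles; the
    # 0.1 placeholder on the diagonal keeps np.log(ftmap) finite below.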
    ftmap = 0.1 * np.ones([finaldocft.shape[0], finaldocft.shape[0]])  #28 x 28
    for idx1 in range(finaldocft.shape[0]):
        for idx2 in range(finaldocft.shape[0]):
            if idx1 != idx2:
                ftmap[idx1, idx2] = np.linalg.norm(finaldocft[idx1, :] -
                                                   finaldocft[idx2, :])
    sns.heatmap(np.log(ftmap))
    plt.savefig('heatmap_topic.png')
    plt.close()

    #Now do metric-learning
    k_tr, dim_out, max_iter = 3, finaldocft.shape[1], 180
    clf = LMNN(n_neighbors=k_tr,
               max_iter=max_iter,
               n_features_out=dim_out,
               verbose=False)
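    # Hand-assigned class labels, assumed to group the 28 speeches by
    # presidency in chronological order.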
    class_labels = [0] * 3 + [1] * 8 + [2] * 9 + [3] * 8
    clf = clf.fit(finaldocft, class_labels)
    #accuracy_lmnn = clf.score(finaldocft, class_labels)
    #print ('Metric learn accuracy: ', accuracy_lmnn)

    ftmap = 0.1 * np.ones([finaldocft.shape[0], finaldocft.shape[0]])  #28 x 28
    for idx1 in range(finaldocft.shape[0]):
        for idx2 in range(finaldocft.shape[0]):
            if idx1 != idx2:
                ftmap[idx1, idx2] = np.linalg.norm(
                    clf.transform([finaldocft[idx1, :]]) -
                    clf.transform([finaldocft[idx2, :]]))
    sns.heatmap(np.log(ftmap))
    plt.savefig('heatmap_topic_metric.png')
    plt.close()
Example #6
# NOTE: imports reconstructed for this snippet; filter (which shadows the
# builtin), get_pdf, kldiv and helper are assumed to be project-local functions.
import pickle as pkl

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pylmnn.lmnn import LargeMarginNearestNeighbor as LMNN


def draw_heatmap(speechList):
    vocab = set([])
    for idx, speech in enumerate(speechList):
        words_in_speech = sum(speech['text_lem'], [])
        words_in_speech_filt = filter(words_in_speech)
        vocab = vocab.union(set(words_in_speech_filt))
    #vocab is a list of all unique words in all the speeches
    pdflist = {}
    try:
        pdflist = pkl.load(open('unigram_pdf.pkl', 'rb'))
    except FileNotFoundError:  # no cached unigram pdfs yet
        for idx, speech in enumerate(speechList):
            year = int(speech['date'].split(' ')[-1])
            print(year)
            pdflist[year] = [speech['speaker'], get_pdf(filter(sum(speech['text_lem'],[])), vocab)]
        pkl.dump(pdflist, open('unigram_pdf.pkl', 'wb'))
        
    try:
        heatmapvals = pkl.load(open('heatmap.pkl', 'rb'))
    except FileNotFoundError:  # no cached heatmap yet
        heatmapvals = np.zeros([len(range(1901, 2017)), len(range(1901, 2017))])
        for year1 in range(1901, 2017):
            print(year1)
            for year2 in range(1901, 2017):
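                # Symmetrised KL divergence between the two years' unigram
                # distributions: 0.5 * (KL(p||q) + KL(q||p)).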
                try:
                    kl1, terms1 = kldiv(pdflist[year1][-1], pdflist[year2][-1])
                    kl2, terms2 = kldiv(pdflist[year2][-1], pdflist[year1][-1])
                    heatmapvals[year1-1901][year2-1901] = 0.5*(kl1+kl2)
                except KeyError:  # year missing from pdflist
                    continue
        pkl.dump(heatmapvals, open('heatmap.pkl', 'wb'))
        
    #create a (chronologically ordered) list of presidents and their start years
    presis = []
    presi_start_year = {}
    for year in range(1901, 2017):
        try:
            if pdflist[year][0] not in presis:
                presis += [pdflist[year][0]]
                presi_start_year[pdflist[year][0]] = year
        except KeyError:  # year missing from pdflist
            continue
    bigmap = helper(presis, heatmapvals, presi_start_year)  #of size num_presis x num_presis

    np.savetxt('heatmap.csv', heatmapvals, delimiter=',')       
    np.savetxt('heatmapbig.csv', bigmap, delimiter=',')  
    #sns.heatmap(heatmapvals)
    sns.heatmap(bigmap)
    plt.savefig('heatmapbig.png')
    plt.close()
    sns.heatmap(heatmapvals[-28:, -28:]) #bush senior to obama only
    plt.savefig('heatmapzoom.png')
    plt.close()
    
    from sklearn.decomposition import PCA
    
    keys = pdflist[2001][1].keys()
    unigramft = np.array([[pdflist[yr][1][k] for k in keys] for yr in range(1989, 2017)])
    pca = PCA(n_components=10)
    print('start PCA')
    pca.fit(unigramft)
    print('fitted PCA')
    newfts = pca.transform(unigramft)
    print('transformed PCA')
    
    try:
        heatmapvals_zoom_pca = pkl.load(open('heatmap_zoom_pca.pkl', 'rb'))
    except FileNotFoundError:  # no cached PCA heatmap yet
        heatmapvals_zoom_pca = np.zeros([len(range(1989, 2017)), len(range(1989, 2017))])
        for year1 in range(1989, 2017):
            print(year1)
            for year2 in range(1989, 2017):
                try:
                    f1 = newfts[year1-1989,:]
                    f2 = newfts[year2-1989,:]
                    heatmapvals_zoom_pca[year1-1989][year2-1989] = np.linalg.norm(f1-f2)
                except:
                    continue
        pkl.dump(heatmapvals_zoom_pca, open('heatmap_zoom_pca.pkl', 'wb'))
        
    sns.heatmap(heatmapvals_zoom_pca) #bush senior to obama only
    plt.savefig('heatmap_zoom_pca.png')
    plt.close()
    
    k_tr, dim_out, max_iter = 3, newfts.shape[1], 180
    clf = LMNN(n_neighbors=k_tr, max_iter=max_iter, n_features_out=dim_out, verbose=False)
    class_labels = [0]*3 + [1]*8 + [2]*9 + [3]*8
    clf = clf.fit(newfts, class_labels)
    
    ftmap = 0.1 * np.ones([newfts.shape[0], newfts.shape[0]])  # 28 x 28
    for idx1 in range(newfts.shape[0]):
        for idx2 in range(newfts.shape[0]):
            if idx1 != idx2:
                ftmap[idx1, idx2] = np.linalg.norm(clf.transform([newfts[idx1, :]]) - clf.transform([newfts[idx2, :]]))
    sns.heatmap(np.log(ftmap))
    plt.savefig('heatmap_pca_metric.png')
    plt.close()
Example #7
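    # NOTE: this snippet is the tail of a repeated-trials loop in the original
    # script; X_train/X_test/y_train/y_test, covX, n_neighbors, weights and the
    # *results accumulator lists are defined by the enclosing code.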
    # Mahalanobis k = 1

    clf = neighbors.KNeighborsClassifier(n_neighbors,
                                         weights=weights,
                                         metric='mahalanobis',
                                         metric_params={'V': covX})
    clf.fit(X_train, y_train)

    acc = clf.score(X_test, y_test)
    print(acc)
    mahanalobisresults.append(acc)

    # lmnn
    clf = LMNN(n_neighbors=n_neighbors,
               max_iter=150,
               n_features_out=X.shape[1])
    clf.fit(X_train, y_train)

    acc = clf.score(X_test, y_test)
    print(acc)
    lmnnresults.append(acc)

print("Euclidean k=1 std:", np.std(euclidean1results), " mean: ",
      np.mean(euclidean1results))
print("Euclidean k=3 std:", np.std(euclidean3results), " mean: ",
      np.mean(euclidean3results))
print("Mahalanobis k=1 std:", np.std(mahanalobisresults), " mean: ",
      np.mean(mahanalobisresults))
print("LMNN k=1 std:", np.std(lmnnresults), " mean: ", np.mean(lmnnresults))
Example #8
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

from pylmnn.lmnn import LargeMarginNearestNeighbor as LMNN
from pylmnn.plots import plot_comparison


# Load a data set
dataset = load_iris()
X, y = dataset.data, dataset.target

# Split in training and testing set
x_tr, x_te, y_tr, y_te = train_test_split(X, y, test_size=0.7, stratify=y, random_state=42)

# Set up the hyperparameters
k_tr, k_te, dim_out, max_iter = 3, 1, X.shape[1], 180

# Instantiate the classifier
clf = LMNN(n_neighbors=k_tr, max_iter=max_iter, n_features_out=dim_out)

# Train the classifier
clf = clf.fit(x_tr, y_tr)

# Compute the k-nearest neighbor test accuracy after applying the learned transformation
accuracy_lmnn = clf.score(x_te, y_te)
print('LMNN accuracy on test set of {} points: {:.4f}'.format(x_te.shape[0], accuracy_lmnn))

# Draw a comparison plot of the test data before and after applying the learned transformation
plot_comparison(clf.L_, x_te, y_te, dim_pref=3)
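# clf.L_ holds the learned linear transformation; plot_comparison applies it to
# the test data (dim_pref selects the preferred dimensionality of the plot).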