Example #1
import sys
import numpy as np
import crossValidation as cv  # assumed name for the project's module providing plsLOO

# `values`, `results`, `X` and `Y` are assumed to be populated earlier in the
# script: one `results` row per candidate model, with Q^2 as the first field.
bmi = values.index(max(values))  # Best Model Index

q2, NV, NVL_Model, NVL_OPS = float(results[bmi][0]), int(results[bmi][1]), int(
    results[bmi][2]), int(results[bmi][3])
if len(sys.argv) > 5:
    start = 4  # feature indices begin right after the four metrics
else:
    infoVec = int(results[bmi][4])  # row carries an extra infoVec column (OPS_auto format)
    start = 5

sel = np.array([int(v) for v in results[bmi][start:]])  # selected feature columns

q2, r2, rmsecv, rmse, corrcv, corrmdl = cv.plsLOO(X[:, sel], Y, int(NVL_Model))

if len(sys.argv) > 5:
    output = np.array([
        q2, r2, rmsecv, rmse, corrcv, corrmdl,
        int(NV),
        int(NVL_Model),
        int(NVL_OPS)
    ])
    print('Best model found:', 'Q2:', '{0:.4f}'.format(q2), 'NV:', int(NV),
          'NVL_Model:', int(NVL_Model), 'NVL_OPS:', int(NVL_OPS))
else:
    output = np.array([
        q2, r2, rmsecv, rmse, corrcv, corrmdl,
        int(NV),
        int(NVL_Model),
        int(NVL_OPS),
        int(infoVec)
    ])
    print('Best model found:', 'Q2:', '{0:.4f}'.format(q2), 'NV:', int(NV),
          'NVL_Model:', int(NVL_Model), 'NVL_OPS:', int(NVL_OPS),
          'InfoVec:', int(infoVec))
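
For reference, the `values` list this fragment indexes can be built straight from the parsed `results` rows, and the lookup itself written with NumPy's argmax, mirroring how the OPS functions below pick their best row. A minimal sketch, assuming Q^2 is the first field of each row and reusing the fragment's `results` and `np`:

values = [float(row[0]) for row in results]  # Q^2 of each candidate model
bmi = int(np.argmax(values))                 # index of the best model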
Example #2
def OPS(X, Y, maxVL_OPS, maxVL_Model, maxVariables, infoVec='prod', verbose=0):
    """
    Ordered Predictors Selection (OPS): a feature-selection algorithm for multivariate regression.
    For details about OPS, please see reference [1].

    Parameters
    ----------
    X : numpy array of shape [n_samples,n_features]
        Samples.
    Y : numpy array of shape [n_samples]
        Target values.
    maxVL_OPS : integer
        Maximum number of latent variables tested in OPS for sorting variables (when infoVec is 'prod' or 'reg').
    maxVL_Model : integer
        Maximum number of latent variables tested in PLS models with each feature subset.
    maxVariables : integer
        Maximum number of features tested.
    infoVec : {'prod', 'reg', 'corr'}
        The informative vector used to sort variables.
        'corr' is the correlation vector between features and target values,
        'reg' is the regression vector obtained with a PLS model, and
        'prod' is the element-wise product of the correlation and regression vectors (usually the best choice).
    verbose : {0,1}
        0 Verbosity mode off.
        1 Verbosity mode on.


    Returns
    -------
    subset : integer list
        Column indices of the selected features.
    q2 : float
        Q^2 external validation metric. For details see reference [2].
    NV : int
        Number of selected features.
    NVL_Model : int
        Number of PLS latent variables in the best model.
    NVL_OPS : int
        Number of OPS latent variables used for sorting variables.
    
    References
    ----------
        [1] Teófilo, R. F.; Martins, J. P. A. & Ferreira, M. M. C. (2009).
        Sorting variables by using informative vectors as a strategy for feature selection in multivariate regression. 
        Journal of Chemometrics, 23(1):32--48. ISSN 0886-9383

        [2] Martins, J. P. A.; Barbosa, E. G.; Pasqualoto, K. F. M. & Ferreira, M. M. C. (2009).
        LQTA-QSAR: a new 4D-QSAR methodology. 
        Journal of Chemical Information and Modeling, 49(6):1428--1436. ISSN 1549-9596
    """
    nSamples, nFeatures = X.shape
    if maxVariables > nFeatures:
        maxVariables = nFeatures

    # Best models are saved in this matrix, each line is formatted as
    # [Q^2, NumVariables, numLatentVariables (Model), NumLatentVariables (OPS), Key of features subset]
    results = np.empty([0, 5])

    selFeats = {}
    key_features = 1

    if infoVec == 'corr':
        maxVL_OPS = 1

    for NVL_OPS in range(1, maxVL_OPS + 1):

        if verbose == 1:
            progress = int((NVL_OPS / maxVL_OPS) * 100)
            print('Running OPS {}%/100%'.format(progress))

        iterResults = np.empty([0, 5])
        iterSelFeats = {}

        if infoVec == 'corr':
            corrVec = correlationVector(X, Y)
            informativeVector = formatInfoVec(corrVec, nFeatures)
        elif infoVec == 'reg':
            regVec = regressionVector(X, Y, NVL_OPS)
            informativeVector = formatInfoVec(regVec, nFeatures)
        else:
            corrVec = correlationVector(X, Y)
            regVec = regressionVector(X, Y, NVL_OPS)
            prodVec = regVec * corrVec
            informativeVector = formatInfoVec(prodVec, nFeatures)

        _, mapping = sortVariables(X, Y, informativeVector)

        for NV in range(1, maxVariables + 1):
            subset = mapping[0:NV]
            for NVL_Model in range(1, maxVL_Model + 1):

                if NV < NVL_Model:
                    continue
                q2, _, _, _, _, _ = cv.plsLOO(X[:, subset], Y, NVL_Model)

                metrics = np.array([q2, NV, NVL_Model, NVL_OPS, key_features])
                iterSelFeats[key_features] = subset
                key_features = key_features + 1
                iterResults = np.vstack((iterResults, metrics))

        bestModelIndex = np.argmax(iterResults[:, 0])
        bestModel = iterResults[bestModelIndex, :]
        if verbose == 1:
            print('Iteration\'s best model:',
                  'Q2:', '{0:.4f}'.format(bestModel[0]), 'NV:',
                  int(bestModel[1]), 'NVL_Model:', int(bestModel[2]),
                  'NVL_OPS:', int(bestModel[3]))
        results = np.vstack((results, bestModel))
        selFeats[bestModel[4]] = iterSelFeats[bestModel[4]]

    maxPerf = np.argmax(results[:, 0])
    q2 = results[maxPerf, 0]
    NV = results[maxPerf, 1]
    NVL_Model = results[maxPerf, 2]
    NVL_OPS = results[maxPerf, 3]
    subset = selFeats[results[maxPerf, 4]]
    if verbose == 1:
        print('Best model found:', 'Q2:', '{0:.4f}'.format(q2), 'NV:', int(NV),
              'NVL_Model:', int(NVL_Model), 'NVL_OPS:', int(NVL_OPS))
    return subset, q2, NV, NVL_Model, NVL_OPS
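
A minimal usage sketch for OPS on synthetic data. The import path is an assumption based on the project file OPS.py, and the hyperparameter values are illustrative only:

import numpy as np
from OPS import OPS  # import path assumed from the project file OPS.py

rng = np.random.default_rng(0)
X = rng.normal(size=(30, 50))                                 # 30 samples, 50 features
Y = X[:, 0] - 2.0 * X[:, 1] + rng.normal(scale=0.1, size=30)  # synthetic target

subset, q2, NV, NVL_Model, NVL_OPS = OPS(
    X, Y, maxVL_OPS=3, maxVL_Model=3, maxVariables=10,
    infoVec='prod', verbose=1)
print('Selected columns:', subset)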
Example #3
File: OPS.py  Project: ramongonze/ops
def OPS_auto(X, Y, NVL_OPS, maxVL_Model, maxVariables, verbose=0):
    """
    Automated version of Ordered Predictors Selection (OPS), a feature-selection
    algorithm for multivariate regression. For details about OPS, please see reference [1].

    This implementation automates the process by testing every available informative vector for sorting variables.
    It runs more slowly than the previous implementation.

    Parameters
    ----------
    X : numpy array of shape [n_samples,n_features]
        Samples.
    Y : numpy array of shape [n_samples]
        Target values.
    NVL_OPS : integer
        Number of latent variables used in OPS for sorting variables (for the 'reg' and 'prod' informative vectors).
    maxVL_Model : integer
        Maximum number of latent variables tested in PLS models with each feature subset.
    maxVariables : integer
        Maximum number of features tested.
    verbose : {0,1}
        0 Verbosity mode off.
        1 Verbosity mode on.


    Returns
    -------
    subset : integer list
        Column indices of the selected features.
    q2 : float
        Q^2 external validation metric. For details see reference [2].
    NV : int
        Number of selected features.
    NVL_Model : int
        Number of PLS latent variables in the best model.
    NVL_OPS : int
        Number of OPS latent variables used for sorting variables.
    infoVec : int
        The informative vector used for sorting variables in the best model found:
        1 for the correlation vector,
        2 for the regression vector,
        3 for the element-wise product of the correlation and regression vectors.
    
    References
    ----------
        [1] Teófilo, R. F.; Martins, J. P. A. & Ferreira, M. M. C. (2009).
        Sorting variables by using informative vectors as a strategy for feature selection in multivariate regression. 
        Journal of Chemometrics, 23(1):32--48. ISSN 0886-9383

        [2] Martins, J. P. A.; Barbosa, E. G.; Pasqualoto, K. F. M. & Ferreira, M. M. C. (2009).
        LQTA-QSAR: a new 4D-QSAR methodology. 
        Journal of Chemical Information and Modeling, 49(6):1428--1436. ISSN 1549-9596
    """


    nSamples, nFeatures = X.shape
    if maxVariables > nFeatures:
        maxVariables = nFeatures


    # Best models are saved in this matrix, each line is formatted as 
    # [Q^2, NumVariables, numLatentVariables (Model), NumLatentVariables (OPS), Key of features subset, Informative Vector]
    results = np.empty([0, 6])

    selFeats = {}
    key_features = 1

    iterResults = np.empty([0, 6])
    iterSelFeats = {}

    regVec = regressionVector(X, Y, NVL_OPS)
    corrVec = correlationVector(X, Y)
    prodVec = regVec * corrVec

    # Begin testing with correlation vector
    informativeVector = formatInfoVec(corrVec, nFeatures)
    _, mapping = sortVariables(X, Y, informativeVector)
    for NV in range(1, maxVariables + 1):
        subset = mapping[0:NV]
        for NVL_Model in range(1, maxVL_Model + 1):

            if NV < NVL_Model:
                continue
            q2, _, _, _, _, _ = cv.plsLOO(X[:, subset], Y, NVL_Model)

            metrics = np.array([q2, NV, NVL_Model, NVL_OPS, key_features, 1])
            iterSelFeats[key_features] = subset
            key_features = key_features + 1
            iterResults = np.vstack((iterResults, metrics))
    # End testing with correlation vector

    # Begin testing with regression vector
    informativeVector = formatInfoVec(regVec, nFeatures)
    _, mapping = sortVariables(X, Y, informativeVector)
    for NV in range(1, maxVariables + 1):
        subset = mapping[0:NV]
        for NVL_Model in range(1, maxVL_Model + 1):

            if NV < NVL_Model:
                continue
            q2, _, _, _, _, _ = cv.plsLOO(X[:, subset], Y, NVL_Model)

            metrics = np.array([q2, NV, NVL_Model, NVL_OPS, key_features, 2])
            iterSelFeats[key_features] = subset
            key_features = key_features + 1
            iterResults = np.vstack((iterResults, metrics))
    # End testing with regression vector

    # Begin testing with product vector
    informativeVector = formatInfoVec(prodVec, nFeatures)
    _, mapping = sortVariables(X, Y, informativeVector)
    for NV in range(1, maxVariables + 1):
        subset = mapping[0:NV]
        for NVL_Model in range(1, maxVL_Model + 1):

            if NV < NVL_Model:
                continue
            q2, _, _, _, _, _ = cv.plsLOO(X[:, subset], Y, NVL_Model)

            metrics = np.array([q2, NV, NVL_Model, NVL_OPS, key_features, 3])
            iterSelFeats[key_features] = subset
            key_features = key_features + 1
            iterResults = np.vstack((iterResults, metrics))
    # End testing with product vector

    bestModelIndex = np.argmax(iterResults[:, 0])
    bestModel = iterResults[bestModelIndex, :]
    
    if verbose == 1:
        print('Iteration\'s best model:',
              'Q2:', '{0:.4f}'.format(bestModel[0]), 'NV:',
              int(bestModel[1]), 'NVL_Model:', int(bestModel[2]),
              'NVL_OPS:', int(bestModel[3]), 'InfoVec:', int(bestModel[5]))
    results = np.vstack((results, bestModel))
    selFeats[bestModel[4]] = iterSelFeats[bestModel[4]]

    maxPerf = np.argmax(results[:, 0])
    q2 = results[maxPerf, 0]
    NV = results[maxPerf, 1]
    NVL_Model = results[maxPerf, 2]
    NVL_OPS = results[maxPerf, 3]
    infoVec = results[maxPerf, 5]
    subset = selFeats[results[maxPerf, 4]]

    return subset, q2, NV, NVL_Model, NVL_OPS, infoVec
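
A matching sketch for OPS_auto, reusing the synthetic X and Y from the OPS example above (import path assumed, as before). The integer infoVec it returns can be decoded with the encoding given in the docstring:

from OPS import OPS_auto  # import path assumed from the project file OPS.py

subset, q2, NV, NVL_Model, NVL_OPS, infoVec = OPS_auto(
    X, Y, NVL_OPS=3, maxVL_Model=3, maxVariables=10, verbose=1)

# Decode the winning informative vector: 1 = 'corr', 2 = 'reg', 3 = 'prod'.
names = {1: 'corr', 2: 'reg', 3: 'prod'}
print('Best informative vector:', names[int(infoVec)])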