Example #1
def optimize(data, number_features, threshold, corr=None, batch=False):
    '''
    This function computes an optimal set of odorants with a more
    efficient approach that applies two optional pre-processing steps to
    the data before the linear optimization runs.

    First, one can omit highly correlated pairs of odorants from the
    data set, since these normally carry only redundant information. To
    activate this step, pass a correlation threshold via corr (e.g.
    corr=0.9).

    In a second step, a fast backward elimination is performed to reduce
    the data to a larger intermediate set of size threshold; all other
    odorants are removed.

    Gurobi is then applied to the remaining odorants. If the values are
    chosen carefully, this can drastically speed up the computation of
    the optimal set of number_features odorants.

    In my thesis I showed that backward elimination is often close to
    optimal but much faster than Gurobi, so a combination of both should
    speed up computation without losing stability.

    Args:
        data (numpy.array):
            The olfactory data matrix. x-dimension: receptors,
            y-dimension: odorants.
        number_features (int):
            Specifies the desired size of the optimal set.
        threshold (int):
            Specifies how many features are kept by backward_elimination
            before Gurobi runs.
        corr (float):
            (default=None). If set, the pairwise correlation between
            features is computed and one feature from each pair with
            correlation above corr is removed.

    Returns:
        f_list (numpy.array):
            The optimal odorant set. Odorants are sorted in increasing
            order of their standard deviation.
        score (float):
            The score. Higher scores are better.
    '''

    if threshold <= number_features:
        raise ValueError("threshold must be larger than number_features")

    removables = []

    if corr:
        c = analysis.correlation(np.transpose(data))

        # c is assumed to hold index pairs with their correlation; from
        # each pair above the threshold, drop one of the two columns
        # (i[0], i[1] are column indices, so the standard deviation must
        # be taken over the corresponding data columns)
        l = c[c[2] > corr]
        for i in l:
            if np.std(data[:, int(i[0])]) > np.std(data[:, int(i[1])]):
                removables.append(int(i[0]))
            else:
                removables.append(int(i[1]))

    f_list = np.setdiff1d(range(data.shape[1]), removables)

    f_tmp, score = featureselection.backward_elimination(
        data[:, f_list], threshold)
    # keep number_features computed by backward elimination
    f_list = np.intersect1d(f_list, f_list[f_tmp])

    f, s = gurobi.optimize(data[:, f_list],
                           number_features=number_features,
                           batch=batch)

    # map the indices returned by Gurobi back to the original column space
    for i, v in enumerate(f):
        f[i] = f_list[v]

    return f, s
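
The correlation pre-filter above depends on the repository's analysis.correlation helper, whose return format is not shown here. Below is a self-contained sketch of the same idea using only NumPy; the function name drop_correlated_features and the 0.92 threshold (borrowed from Example #2) are illustrative, not part of the original code.

import numpy as np


def drop_correlated_features(data, threshold=0.92):
    """Return the column indices that survive correlation filtering.

    data: receptors x odorants matrix. Of each pair of columns whose
    Pearson correlation exceeds the threshold, the less variable column
    is dropped (mirroring Example #2, which keeps the more variable of
    the two).
    """
    n = data.shape[1]
    c = np.corrcoef(data, rowvar=False)  # odorant-by-odorant correlations
    removables = set()
    for i in range(n):
        for j in range(i + 1, n):
            if c[i, j] > threshold:
                # drop whichever column of the pair varies less
                removables.add(j if np.std(data[:, i]) > np.std(data[:, j]) else i)
    return np.setdiff1d(np.arange(n), list(removables))

The surviving indices can then be fed to backward elimination and Gurobi exactly as in the function above.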
Example #2
removables = []
# compare every pair of receptor rows; the enclosing loops are missing
# from this fragment and are reconstructed here
for i, rowA in enumerate(data):
    for j, rowB in enumerate(data):
        if j > i:
            c = sp.stats.pearsonr(rowA, rowB)[0]
            if c > 0.92:

                if np.std(rowA) > np.std(rowB):
                    removables.append(j)
                else:
                    removables.append(i)

removables = np.unique(removables)
print "remove (%s)" % (len(removables)), removables

data = np.delete(data, removables, axis=0)
feature_names = np.delete(feature_names, removables)
data = np.transpose(data)
feature_list, scores = featureselection.backward_elimination(data)

title = 'Backward Elimination with preprocessing of Hallem with %s features' % str(
    features)
sub_list = feature_list[:features]
print title
print "Score:", scores[features]
print sub_list
print feature_names[sub_list]


#path = "../figures/hallem/be/hallem_be_" + str(features) + "_performance_preprocessing.png"
#plotting.plot_progress_results(scores, features, path=path)
#
#path = "../figures/hallem/be/hallem_be_" + str(features) + "_preprocessing.png"
#plotting.plot_fingerprints(title,
Example #3
#!/usr/bin/env python
# encoding: utf-8
import numpy as np
from example.mouse.data import MouseData
from _core import plotting
from _core.featureselection import backward_elimination

mouse = MouseData()
data = mouse.response
feature_names = mouse.feature_names
data_names = mouse.data_names

feature_list, score = backward_elimination(data)
features = 6
title = 'Backward Elimination on Mouse'
print title
print "Score:", score[-1]
print feature_names[feature_list]

path = "/Users/marcus/Desktop/mouse_be_" + str(features) + "_performance.png"
plotting.plot_progress_results(score, features, path)

#path = "/Users/marcus/Desktop/mouse_be_" + str(features) + ".png"
#plotting.plot_fingerprints(title, feature_names[feature_list],
#                           data[:, feature_list],
#                           data_names)
Example #4
"""
Analysis of the stability of backward elimination on the DoOR dorsal dataset.
"""
import datetime
import numpy as np
import matplotlib.pyplot as pl
from example.door import DoOR
from _core import featureselection, validation

door = DoOR()

data, ors, odorants = door.get_dorsal_data()

# compute features
print data.shape
feature_list, scores = featureselection.backward_elimination(data)

# create a list of feature lists with increasing size (1 to 15 features)
top = []
for i in range(1, 16):
    top.append(feature_list[:i][::-1])

print top

# levels of noise which will be added
sd_range = np.arange(0, 0.15, 0.01)
results = validation.validate(data, top, noise=sd_range)

x = sd_range
y = np.asarray(range(1, results.shape[0] + 1))
X, Y = np.meshgrid(x, y)
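
The fragment stops after building the meshgrid; a plausible completion that draws the stability surface, assuming results holds one row per feature-set size and one column per noise level (the axis labels are guesses, not from the original):

# contour plot of validation results over noise level (x) and set size (y)
pl.figure()
cs = pl.contourf(X, Y, results)
pl.colorbar(cs)
pl.xlabel("noise level (standard deviation)")
pl.ylabel("number of features")
pl.title("Stability of backward elimination on DoOR dorsal data")
pl.show()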
Example #5
# Setup assumed by this fragment; the imports, size grids, and
# accumulator matrices are not shown on the page, so the values below
# are illustrative and the sample_generator import path is a guess.
import csv
import time

import numpy as np

from _core import sample_generator  # hypothetical import path
from _core.featureselection import backward_elimination

size_data = [100, 200, 500]   # numbers of samples per benchmark cell
size_feat = [10, 20, 50]      # numbers of features per benchmark cell
times = np.zeros((len(size_data), len(size_feat)))
scores = np.zeros((len(size_data), len(size_feat)))
rounds = 5


def write_csv(f, matrix):
    x = np.column_stack((np.transpose(size_data), matrix))
    v = np.hstack(["Size", size_feat])

    with open(f, 'wb') as ff:
        writer = csv.writer(ff, delimiter=";")
        writer.writerow(v)
        writer.writerows(x)


for k in range(1, rounds + 1):
    for i, v_i in enumerate(size_data):
        for j, v_j in enumerate(size_feat):
            d = sample_generator.generate_random_data(v_i, v_j)

            start = time.time()
            feature_list, score = backward_elimination(d, 4)
            stop = time.time()

            delta = stop - start
            times[i, j] += delta
            scores[i, j] += score
            print "Round %s, data: %s, features: %s, time: %s" \
                  % (k, v_i, v_j, round(delta, 4))

            write_csv("../results/be_times_pew.csv", times / k)
            write_csv("../results/be_scores_pew.csv", scores / k)
Example #7
def batch_compute(job_id, features, methods):

    current_app.logger.debug("Computation for " + job_id)
    matrix = model.load_job_data(job_id)

    data = np.asarray(matrix[1:, 1:], dtype=float)
    for m in methods:
        if m == methodsmap.backward_elimination["key"]:
            try:
                f, r = featureselection.backward_elimination(data)
                f = [str(f[:i].tolist()) for i in range(features)]
                save_results(job_id, f[:features], r[:features],
                             features, m)
            except Exception as inst:
                current_app.logger.error(inst)
                abort(500)
        elif m == methodsmap.forward_selection["key"]:
            try:
                f, r = featureselection.forward_selection(data)
                f = [str(f[:i].tolist()) for i in range(features)]
                save_results(job_id, f[:features], r[:features],
                             features, m)
            except Exception as inst:
                current_app.logger.error(inst)
                abort(500)
        elif m == methodsmap.gurobi["key"]:
            try:
                f, r = gurobi.optimize(data,
                                       number_features=features,
                                       batch=True)

                #convert arrays to string
                f = [str(i.tolist()) for i in f]
                save_results(job_id, f, r, features, m)
            except Exception as inst:
                current_app.logger.error(inst)
                abort(500)
        elif m == methodsmap.pipelining["key"]:
            try:
                # keep the intermediate set one element larger than the
                # requested size, capped at the number of available features
                threshold = features + 1 if len(data[0]) > features else len(data[0])
                f, r = pipelining.optimize(data,
                                           number_features=features,
                                           threshold=threshold,
                                           corr=0.94,
                                           batch=True)
                # convert arrays to string
                f = [str(i.tolist()) for i in f]
                save_results(job_id, f, r, features, m)
            except Exception as inst:
                current_app.logger.error(inst)
                abort(500)
        else:
            pass