Exemplos de Fitter em Python, exemplos de fitter.Fitter em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: fit_distribution.py Projeto: Gasia44/Applied-Statistics

def fit_distributions(distributions, data):
    """
    Fit candidate distributions on data.

    :param distributions: sequence of distribution names handed to
        ``fitter.Fitter``. When it is ``None``, empty, or its first element
        is ``"ALL"``, ``Fitter`` is called with ``distributions=None``
        (i.e. fitter's own default selection).
    :param data: sample values handed to ``fitter.Fitter``.
    :return: the ``fitter.Fitter`` instance after ``fit()`` has been called.
    """
    # An empty or missing selection used to crash on ``distributions[0]``;
    # treat it the same way as the explicit "ALL" marker.
    if (distributions is None or len(distributions) == 0
            or distributions[0] == "ALL"):
        selected = None
    else:
        selected = distributions
    f = fitter.Fitter(data, distributions=selected, verbose=True)
    f.fit()
    return f

Exemplo n.º 2

0

Exibir arquivo

Arquivo: metadata.py Projeto: Tuanlase02874/Machine-Learning-Kaggle

def distribution_helper(data_list, distribution_list):
    """Fit ``distribution_list`` on ``data_list`` and return the fit errors.

    :param data_list: sample values handed to ``fitter.Fitter``.
    :param distribution_list: distribution names to fit.
    :return: the ``'sumsquare_error'`` entry of ``df_errors.to_dict()``.
    """
    fit = fitter.Fitter(data_list,
                        distributions=distribution_list,
                        timeout=600,
                        verbose=False)
    fit.fit()
    return fit.df_errors.to_dict()['sumsquare_error']

Exemplo n.º 3

0

Exibir arquivo

Arquivo: main.py Projeto: minoruronny79/RAISA_WebAPP

def filterSummary():
    """Summarise the best-fitting distribution of the target variable per group.

    Splits ``file1`` by the unique values of the column named by
    ``filterSelect.value``, fits a fixed set of candidate distributions to the
    ``targetVar.value`` column of each group and collects the best fit
    together with the values returned by ``generador_estadistica_2``.

    A group whose fit fails is still reported: its "Best Distribution" is the
    text ``"Action can not be performed"`` and its mean/std/generador are
    filled with placeholders, so every column keeps one entry per group.

    :return: ``(bestdist_basin, data_gen)`` -- two DataFrames with one row
        per group -- or ``0`` when the filter is set to ``"No Filter"``.
    """
    if filterSelect.value == "No Filter":
        return 0

    # "lognorm" used to be listed twice, which only repeated the same fit.
    candidates = [
        "gamma", "uniform", "lognorm", "norm", "expon", "exponnorm",
        "logistic", "triang"
    ]

    basins = file1[filterSelect.value].unique()
    lista_basin = []
    lista_results_basin = []
    lista_mean = []
    lista_std = []
    lista_generador = []
    for z in basins:
        basin_df = file1[file1[filterSelect.value] == z]
        lista_basin.append(z)
        try:
            basin_dis = fitter.Fitter(basin_df[targetVar.value],
                                      distributions=list(candidates))
            basin_dis.fit()
            basin_bestfit = basin_dis.get_best()
            print("A1")
            base = generador_estadistica_2(basin_bestfit)
            print(base)
            mean = base['mean']
            std = base['std']
            generador = base["generador"]
        except Exception:
            # Previously only the result list was extended on failure, which
            # left the lists with different lengths and made the DataFrame
            # construction below raise. Pad every column instead.
            basin_bestfit = "Action can not be performed"
            mean = float("nan")
            std = float("nan")
            generador = None
        # Append all columns together so they always stay aligned.
        lista_results_basin.append(basin_bestfit)
        lista_mean.append(mean)
        lista_std.append(std)
        lista_generador.append(generador)

    aux_bestdist_basin = {
        filterSelect.value: lista_basin,
        "Best Distribution": lista_results_basin,
        "Mean": lista_mean,
        "std": lista_std
    }
    aux_data_gen = {
        filterSelect.value: lista_basin,
        "generador": lista_generador
    }
    bestdist_basin = pd.DataFrame(aux_bestdist_basin)
    data_gen = pd.DataFrame(aux_data_gen)
    return bestdist_basin, data_gen

Exemplo n.º 4

0

Exibir arquivo

    def _leaderboard_compute_overall_score(self, N=100):
        """Compute the overall score of model1 from the NULL distribution.

        Not finalised.

        Runs both p-value computations with ``N``, fits a beta distribution
        to ``rdistance_param1`` and ``rdistance_pred1``, evaluates the fitted
        beta CDF at the stored scores and saves the result both on the
        instance (``pvalues_param1`` / ``pvalues_pred1``) and under the
        ``'pvalues'`` key of the matching ``self.scores`` entry.
        """
        self._compute_pvalues_pred1(N=N)
        self._compute_pvalues_param1(N=N)
        import fitter

        def fit_beta(sample):
            # Restrict the fitter to the beta distribution before fitting.
            beta_fit = fitter.Fitter(sample)
            beta_fit.distributions = ['beta']
            beta_fit.fit()
            return beta_fit

        param1_fit = fit_beta(self.rdistance_param1)
        pred1_fit = fit_beta(self.rdistance_pred1)

        import scipy.stats
        beta_cdf = scipy.stats.beta.cdf
        self.pvalues_param1 = beta_cdf(self.scores['param1'].scores,
                                       *param1_fit.fitted_param['beta'])
        self.pvalues_pred1 = beta_cdf(self.scores['pred1'].scores,
                                      *pred1_fit.fitted_param['beta'])

        self.scores['pred1']['pvalues'] = self.pvalues_pred1
        self.scores['param1']['pvalues'] = self.pvalues_param1

Exemplo n.º 5

0

Exibir arquivo

Arquivo: main.py Projeto: minoruronny79/RAISA_WebAPP

def analysis(variable):
    """Fit a fixed set of candidate distributions to ``variable``.

    :param variable: sample values handed to ``fitter.Fitter``.
    :return: ``(sumario, distfit_var)`` where ``sumario`` is the result of
        ``summary(plot=False, Nbest=8)`` and ``distfit_var`` the fitted
        ``Fitter`` object; ``None`` when the fit fails (a message is
        printed in that case).
    """
    # "lognorm" used to be listed twice, which only repeated the same fit.
    candidates = [
        "gamma", "uniform", "lognorm", "norm", "expon", "exponnorm",
        "logistic", "triang"
    ]
    try:
        distfit_var = fitter.Fitter(variable, distributions=candidates)
        distfit_var.fit()
        sumario = distfit_var.summary(plot=False, Nbest=8)
    except Exception:
        # ``except Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        print("This is not a numeric variable")
        return None
    return sumario, distfit_var

Exemplo n.º 6

0

Exibir arquivo

    def __init__(self, pfad, **kwargs):
        """Set up the image object for the file at ``pfad``.

        Args:
            pfad (str): File path of the image.

        Kwargs:
            none (yet)

        Creates:
            self.att (dict): dictionary holding image attributes; receives
                the key ``'bid'`` when the file exists.
            self.rnio, self.fitter: helper objects.
            self.pfad (str): only set when the file exists.

        When ``pfad`` does not exist a warning is logged and the remaining
        setup is skipped.
        """
        self.att = {}  # attribute dictionary
        self.rnio = rnio.RnIo()
        self.fitter = fitter.Fitter()

        # Without a file on disk there is nothing more to initialise.
        if not os.path.exists(pfad):
            logging.warning('No file found under pfad %s. No image was opened',
                            pfad)
            return

        self.pfad = str(pfad)
        logging.info('Image pfad was set: %s', pfad)
        self.calc_name()
        # Hand out the next image id while holding the class-level lock.
        with Bild.lock:
            self.att['bid'] = Bild.bid_count
            Bild.bid_count += 1

Exemplo n.º 7

0

Exibir arquivo


# Build the environments, seeding each one with its own index
envs = [gym.make("Pong-v0") for _ in range(n_envs)]
for i, env in enumerate(envs):
    env.seed(i)
# Latest observation of every environment, kept between rollouts
obs_bookmarks = [env.reset() for env in envs]
prev_bookmarks = [0] * n_envs

# Model, optimizer and fitter
action_dim = 2  # Pong specific number of possible actions
# First axis of the preprocessed observation gives the network input size
prepped_state = preprocess(obs_bookmarks[0])
input_dim = prepped_state.shape[0]
net = model.Model(input_dim, action_dim)
optimizer = optim.Adam(net.parameters(), lr=lr)
fit_obj = fitter.Fitter(input_dim, action_dim, n_olddatas=n_olddatas)

# Restore saved network and optimizer state when resuming
if resume:
    net.load_state_dict(torch.load(net_save_file))
    optimizer.load_state_dict(torch.load(optim_save_file))

optimizer.zero_grad()

# Modules reused later on
logsoftmax = nn.LogSoftmax()
softmax = nn.Softmax()
mseloss = nn.MSELoss()

# Rollout storage: one independent empty list per tracked quantity
actions, observs, rewards, old_pis, old_vals, advantages, mask = (
    [] for _ in range(7))
episode_reward = 0

Exemplo n.º 8

0

Exibir arquivo

Arquivo: Laba1Done.py Projeto: TolyaPapa/Laba1

def _write_channel_stats(out, name, values, pixel_count, sum_suffix=''):
    """Write the descriptive statistics of one colour channel to ``out``.

    ``sum_suffix`` is inserted before the newline of the "Sum" line; it
    exists only so the report stays byte-identical to the historical format.
    """
    out.write('Sum of {} channel is : {}{}\n'.format(
        name, sum(values), sum_suffix))
    out.write('Median of {} channel is : {}\n'.format(
        name, stt.median(values)))
    out.write('Lower and Upper quantile of {} channel are : {} {}\n'.format(
        name, np.quantile(values, 0.25), np.quantile(values, 0.75)))
    out.write('Mean value is : {}\n'.format(stt.mean(values)))
    out.write('Skewness and Kurtosis are : {} {}\n'.format(
        skew(np.array(values)), kurtosis(values)))
    out.write('Average value of {} channel is : {}\n'.format(
        name, sum(values) / pixel_count))
    out.write('The Variance of {} channel is  : {}\n\n'.format(
        name, stt.variance(values)))


def _report_image(out, pdf, filename, number):
    """Write statistics and fitted-distribution histograms for one image."""
    out.write('Output for Image number {}\n\n'.format(number))

    # Decode the picture once; the file handle is released immediately.
    with Image.open(filename) as raw:
        pixels = np.array(raw.convert('RGB'))
    height, width = pixels.shape[:2]
    pixel_count = width * height

    # Row-major flattening matches the former y-outer / x-inner pixel loop.
    Red = pixels[:, :, 0].ravel().tolist()
    Green = pixels[:, :, 1].ravel().tolist()
    Blue = pixels[:, :, 2].ravel().tolist()

    for name, values, tail in (('Red', Red, '\n'), ('Green', Green, '\n'),
                               ('Blue', Blue, '\n\n')):
        out.write(
            'Max and min values of {} channel of image {} are: {}, {}{}'.
            format(name, number, max(values), min(values), tail))

    _write_channel_stats(out, 'Red', Red, pixel_count)
    # The Green "Sum" line has always carried a trailing space; keep it.
    _write_channel_stats(out, 'Green', Green, pixel_count, sum_suffix=' ')
    _write_channel_stats(out, 'Blue', Blue, pixel_count)

    for name, num in COLOR.items():
        plt.figure()
        # Fit on the same RGB-converted pixels the statistics were computed
        # from, instead of re-opening the unconverted file for every channel.
        a = pixels[:, :, num].ravel()
        f = fitter.Fitter(
            a,
            distributions=['beta', 'gamma', 'uniform', 'norm', 'laplace'],
            bins=256,
            verbose=False)
        f.fit()
        out.write("Fitted errors for " + name + " channel:\n\n")
        for k, v in f._fitted_errors.items():
            out.write(str(k) + ' >>> ' + str(v) + '\n')
        out.write("\n")
        f.summary()
        f.hist()
        plt.title(str(name + " channel of Image number " + str(number)))
        pdf.savefig()
    plt.close('all')


def Solve():
    """Write per-channel image statistics and fitted histograms to disk.

    For the images matched by the hard-coded glob pattern this writes
    descriptive statistics of the Red/Green/Blue channels to ``OutPut.txt``
    and saves one histogram page per entry of ``COLOR`` to
    ``AllHistogram.pdf``. Processing stops after the first image
    (``cnt >= 1``). Both output files are closed even if an image fails.
    """
    with open('OutPut.txt', "w") as g:
        pp = PdfPages("AllHistogram.pdf")
        try:
            np.seterr(divide='ignore', invalid='ignore')
            if not sys.warnoptions:
                import warnings
                warnings.simplefilter("ignore")  # ignore some warnings from system
            cnt = 0  # count how many images have been processed
            for filename in glob.glob(
                    'D:/10semester/Progonov/Лабораторные работы/mirflickr/*.jpg'):
                _report_image(g, pp, filename, cnt + 1)
                g.write('=================================================\n\n')
                cnt += 1
                if cnt >= 1:
                    break
        finally:
            pp.close()

Exemplo n.º 9

0

Exibir arquivo

Arquivo: anova.py Projeto: Donnyvdm/gdsctools

    def anova_one_drug_one_feature(self,
                                   drug_id,
                                   feature_name,
                                   show=False,
                                   production=False,
                                   directory='.',
                                   fontsize=18):
        """Compute ANOVA for one drug and one feature

        :param drug_id: a valid drug identifier
        :param feature_name: a valid feature name
        :param bool show: show boxplots with the different factor used
        :param str directory: where to save the figure.
        :param bool production: if False, returns a dataframe otherwise
            a dictionary. This is to speed up analysis when scanning
            the drug across all features.
        :param int fontsize: font size forwarded to the boxplots (only
            used when *show* is True).
        :return: a dictionary of results if *production* is True, otherwise
            a one-row dataframe built from that dictionary.
        :raises ValueError: if *drug_id* or *feature_name* is unknown.

        .. note:: **for developer** this is the core of the analysis
            and should be kept as fast as possible. 95% of the time is spent
            here.

        .. note:: **for developer** Data used in this function comes from
            _get_one_drug_one_feature_data method, which should also be kept
            as fast as possible.
        """
        if drug_id not in self.drugIds:
            raise ValueError('Unknown drug name %s. Use e.g., %s' %
                             (drug_id, self.drugIds[0]))

        if feature_name not in self.feature_names:
            # we start index at 3 to skip tissue/name/msi
            # NOTE(review): the slice below is [0:3], i.e. the first three
            # names, which does not match the comment above -- confirm.
            raise ValueError('Unknown feature name %s. Use e.g. one of %s' %
                             (feature_name, self.feature_names[0:3]))

        # This extract the relevant data and some simple metrics
        # This is now pretty fast accounting for 45 seconds
        # for 265 drugs and 988 features
        odof = self._get_one_drug_one_feature_data(drug_id, feature_name)

        # if the status is False, it means the number of data points
        # in a category (e.g., positive feature) is too low.
        # If so, nothing to do, we return an 'empty' dictionary
        if odof.status is False:
            results = self._odof_dict.copy()
            results['FEATURE'] = feature_name
            results['DRUG_ID'] = odof.drug_id
            results['DRUG_NAME'] = odof.drug_name
            results['DRUG_TARGET'] = odof.drug_target
            results['N_FEATURE_pos'] = odof.Npos
            results['N_FEATURE_neg'] = odof.Nneg
            if production is True:
                # return a dict
                return results
            else:
                # with newer version of pandas (v0.19), None are not accepted
                # anymore
                for k in results.keys():
                    if results[k] is None:
                        results[k] = np.nan
                df = pd.DataFrame(results, index=[1])
                return df

        # IMPORTANT: the order of the factors in the formula
        # is important. It does not change the total sum of square errors
        # but may change individual effects of the categorical components.

        # If a formula is provided, use statsmodels. Since it is slowish,
        # we implemented several cases as described in the doc for the 4
        # following cases:
        # - TISSUE + MSI +MEDIA + FEATURE
        # - TISSUE + MSI + FEATURE
        # - MSI + FEATURE
        # - FEATURE
        if self.settings.regression_formula not in ["auto", None, ""]:
            # This populates the anova_pvalues attribute itself
            _ = self.anova_one_drug_one_feature_custom(
                drug_id,
                feature_name,
                formula=self.settings.regression_formula,
                odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)
        elif self.settings.analysis_type == 'PANCAN':
            # IMPORTANT: tissues are sorted alphabetically in R aov
            # function. Same in statsmodels but capitalised names
            # are sorted differently. In R, a<b<B<c but in Python,
            # A<B<C<a<b<c. So, 'aero' tissue is before 'Bladder' in R,
            # not in python. Since in a linear regression
            # models, the order of the factor matters and the first
            # factor is used as a reference, we decided to use same
            # convention as in R.
            # see http://statsmodels.sourceforge.net/devel/contrasts.html
            # for a good explanation

            # We could use pd.get_dummies but pretty slow
            # instead we create the full matrix in init() method.
            # One issue is that some columns end up with sum == 0
            # and needs to be dropped.
            df = self._tissue_dummies.loc[odof.masked_tissue.index]
            todrop = df.columns[df.values.sum(axis=0) == 0]

            if len(todrop) > 0:  # use if since drop() is slow
                df = df.drop(todrop, axis=1)
            # Drop the first tissue column for the regression
            tissues = [x for x in df.columns if x.startswith('C(tissue')]
            df.drop(tissues[0], axis=1, inplace=True)
            # Here we set other variables with dataframe columns' names as
            # expected by OLS.
            if self.settings.include_media_factor == False:
                # make sure the media factor is not included
                todrop = [x for x in df.columns if x.startswith('C(media)')]
                df = df.drop(todrop, axis=1)
            else:
                # drop the first one for the regression
                medias = [x for x in df.columns if x.startswith('C(media')]
                if len(medias):
                    df.drop(medias[0], axis=1, inplace=True)
            df['C(msi)[T.1]'] = odof.masked_msi.values
            df['feature'] = odof.masked_features

            # The regression itself
            self.data_lm = OLS(odof.Y, df.values).fit()
            # The ANOVA
            self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                         odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)
        elif self.settings.include_MSI_factor is True:
            # Design matrix built by hand: a row of ones, the MSI values and
            # the feature values, transposed to one row per data point.
            df = DummyDF()
            df.values = np.ones((3, odof.Npos + odof.Nneg))
            df.values[1] = odof.masked_msi.values
            df.values[2] = odof.masked_features
            df.values = df.values.T
            # The regression itself
            self.data_lm = OLS(odof.Y, df.values).fit()
            # The ANOVA itself
            self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                         odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)
        else:
            # Design matrix built by hand: a row of ones and the feature
            # values, transposed to one row per data point.
            df = DummyDF()
            df.values = np.ones((2, odof.Npos + odof.Nneg))
            df.values[1] = odof.masked_features
            df.values = df.values.T
            # The regression itself
            self.data_lm = OLS(odof.Y, df.values).fit()
            # The ANOVA itself
            self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                         odof=odof)
            results = self._set_odof_results(self.anova_pvalues, odof)

        key = str(drug_id) + "__" + feature_name
        if self.sampling and key not in self.pvalues_features.keys():
            # This can be computed for a drug once for all
            # no need to redo it for each feature ?
            # If the length of Y is too small (e.g., < 20) the results may not be
            # great. This can be checked with the errors
            self.samples1 = []
            self.samples2 = []
            self.samples3 = []
            Y = odof.Y.copy()
            N = self.sampling
            pb = Progress(N, 20)
            for i in range(0, N):
                # To get the random distribution, shuffle Y
                # and noise not required
                # To get the noise effects, do not shuffle and set noise to
                # something different from 0
                noise = 0.0
                pylab.shuffle(Y)
                #data_lm = OLS(Y, df.values).fit()
                # NOTE(review): `df` is only assigned in the branches above
                # that do not use a custom regression formula; with a custom
                # formula and sampling enabled this line looks like it would
                # fail on an unbound `df` -- confirm.
                data_lm = OLS(Y + noise * pylab.randn(len(Y)), df.values).fit()
                anova_pvalues = self._get_anova_summary(data_lm,
                                                        output='dict',
                                                        odof=odof)
                # 'msi' and 'tissue' entries are optional: any failure to
                # read them is silently ignored by the bare excepts below.
                try:
                    self.samples1.append(anova_pvalues['msi'])
                except:
                    pass
                self.samples2.append(anova_pvalues['feature'])
                try:
                    self.samples3.append(anova_pvalues['tissue'])
                except:
                    pass
                #pb.animate(i+1)
            # Fit a single distribution ("genexpon") to -log10 of the sampled
            # feature p-values and store its error and parameters under `key`.
            import fitter
            ff = fitter.Fitter(-pylab.log10(self.samples2))
            dist = "genexpon"
            ff.distributions = [dist]
            ff.fit()
            self.pvalues_features[key] = {
                'error': ff.df_errors.loc[dist].values[0],
                'params': ff.fitted_param[dist],
                'feature': feature_name,
                'N': len(Y)
            }

        if show is True:
            boxplot = BoxPlots(odof,
                               savefig=self.settings.savefig,
                               directory=directory,
                               fontsize=fontsize)
            boxplot.boxplot_association(fignum=1)

            # a boxplot to show cell lines effects. This requires
            # the settings.analyse_type to be PANCAN
            if self.settings.analysis_type == 'PANCAN':
                boxplot.boxplot_pancan(fignum=2, mode='tissue')
            # NOTE(review): the msi and media boxplots both use fignum=3 --
            # confirm whether the media plot is meant to reuse that figure.
            if self.settings.include_MSI_factor:
                boxplot.boxplot_pancan(fignum=3, mode='msi')
            if self.settings.include_media_factor:
                boxplot.boxplot_pancan(fignum=3, mode='media')

        # about 30% of the time spent in creating the DataFrame...
        if production is True:
            return results
        else:
            # with newer version of pandas (v0.19), None are not accepted
            # anymore
            for k in results.keys():
                if results[k] is None:
                    results[k] = np.nan
            df = pd.DataFrame(results, index=[1])
            return df

Exemplo n.º 10

0

Exibir arquivo

Arquivo: sequence_length_fitting.py Projeto: cokelaer/euroscipy_proceedings

entries = pd.read_csv("/home/cokelaer/entries.txt", header=None)
# Series.as_matrix() was removed in pandas 1.0; .values gives the same array
# on old and new pandas alike.
entries = list(entries[0].values)

# This command takes a while: about 20 minutes with a good connection.
# This will download lots of fields from uniprot for each entry.
# Later on we will play with the sequence length, which could
# have been extracted from the downloaded file but this example
# is for illustration.

# obtain a dataframe filled with all data from all entries
# NOTE(review): `u` is not defined in this snippet -- presumably a service
# object created earlier in the original file; confirm.
df = u.get_df()

# let us build a vector made of the length of the sequence.
# we restrict ourselves to lengths below 3000
data = df[df.Length < 3000].Length

# now, we may want to figure out what kind of distribution this sample is
# coming from. We will use the package called fitter, available on pypi with
# a layer built on top of scipy (distribution and fit)
import fitter
f = fitter.Fitter(data, bins=150)
f.distributions = ['lognorm', 'chi2', 'rayleigh', 'cauchy', 'invweibull']
f.fit()
f.summary()
f.summary(lw=3)
xlabel("Sequence length", fontsize=20)
ylabel("PDF", fontsize=20)
savefig("sequence_length_fitting.png", dpi=200)
savefig("sequence_length_fitting.eps", dpi=200)
savefig("sequence_length_fitting.svg", dpi=200)

Exemplo n.º 11

0

Exibir arquivo

Arquivo: simulador_productividad.py Projeto: eliasft/projects

import dca

import scipy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import fitter

dca.productividad()

from dca import gasto

# Keep only the rows whose mes_max exceeds 50
data = gasto[gasto.mes_max > 50]


def _fit_best(sample):
    """Fit ``sample`` with ``timeout=120``.

    Returns ``(fitter_object, best, params)`` where ``best`` is the result of
    ``get_best()`` and ``params`` the first value of that mapping.
    """
    fit = fitter.Fitter(sample, timeout=120)
    fit.fit()
    best = fit.get_best()
    return fit, best, list(best.values())[0]


# Same procedure for each of the three columns, in the original order
f_Qi, best_Qi, Qi_params = _fit_best(data.Qi_hist)
f_di, best_di, di_params = _fit_best(data.di_hyp)
f_b, best_b, b_params = _fit_best(data.b)

#print(f_Qi.summary())

Exemplo n.º 12

0

Exibir arquivo

    def anova_one_drug_one_feature(self,
                                   drug_id,
                                   feature_name,
                                   show=False,
                                   production=False,
                                   directory='.'):
        """Compute ANOVA and various tests on one drug and one feature

        :param drug_id: a valid drug identifier (must be in ``self.drugIds``)
        :param feature_name: a valid feature name (must be in
            ``self.feature_names``)
        :param bool show: show some plots
        :param str directory: where to save the figure.
        :param bool production: if False, returns a dataframe otherwise
            a dictionary. This is to speed up analysis when scanning
            the drug across all features.
        :return: the results dictionary if *production* is True, otherwise
            a one-row :class:`pandas.DataFrame` (index ``[1]``) built from
            that dictionary.
        :raises ValueError: if *drug_id* or *feature_name* is unknown, or if
            ``settings.regression_method`` is not one of 'ElasticNet',
            'OLS', 'Ridge' or 'Lasso'.

        Side effects visible in this method: ``self.data_lm`` and
        ``self.anova_pvalues`` are overwritten on every full run;
        ``self.Y`` and ``self.EV`` are set in the PANCAN case; when
        ``self.sampling`` is set, ``self.samples1/2/3`` and
        ``self.pvalues_features[key]`` are filled in.

        .. note:: **for developer** this is the core of the analysis
            and should be kept as fast as possible. 95% of the time is spent
            here.

        .. note:: **for developer** Data used in this function comes from
            _get_one_drug_one_feature_data method, which should also be kept
            as fast as possible.
        """
        if drug_id not in self.drugIds:
            raise ValueError('Unknown drug name %s. Use e.g., %s' %
                             (drug_id, self.drugIds[0]))

        if feature_name not in self.feature_names:
            # The message lists the first three known feature names.
            # NOTE(review): an older comment here spoke of starting at
            # index 3 to skip tissue/name/msi, which the slice below does
            # not do -- confirm which is intended.
            raise ValueError('Unknown feature name %s. Use e.g. one of %s' %
                             (feature_name, self.feature_names[0:3]))

        # This extracts the relevant data and some simple metrics
        # This is now pretty fast accounting for 45 seconds
        # for 265 drugs and 988 features
        odof = self._get_one_drug_one_feature_data(drug_id, feature_name)
        drug_name = self.drug_decode.get_name(drug_id)
        drug_target = self.drug_decode.get_target(drug_id)

        # if the status is False, it means the number of data points
        # in a category (e.g., positive feature) is too low.
        # If so, nothing to do, we return an 'empty' dictionary
        if odof.status is False:
            # Work on a copy so the template dictionary is left untouched.
            results = self._odof_dict.copy()
            results['FEATURE'] = feature_name
            results['DRUG_ID'] = drug_id
            results['DRUG_NAME'] = drug_name
            results['DRUG_TARGET'] = drug_target
            results['N_FEATURE_pos'] = odof.Npos
            results['N_FEATURE_neg'] = odof.Nneg
            if production is True:
                # return a dict
                return results
            else:
                # or a dataframe; note that index is not relevant here but
                # required.
                df = pd.DataFrame(results, index=[1])
                return df

        # with the data extracted, we can now compute the regression.

        # In R or statsmodels, the regression code is simple since
        # it is based on the formula notation (Y~C(msi)+feature)
        # This is also possible in statsmodels library,  however,
        # this relies on patsy, which is very slow as compared to the
        # statsmodels without formula.

        # IMPORTANT: the order of the factors in the formula
        # is important. It does not change the total sum of square errors
        # but may change individual effects of the categorical
        # components.

        # Instead of using ols function, we use the OLS one so we cannot
        # use formula. Instead, we need to create manually the input
        # data. In the case of categorical data (tissue), we need to
        # create the dummy variable, which is done in the constructor
        # once for all (slow otherwise).
        if self.settings.analysis_type == 'PANCAN':
            # IMPORTANT: tissues are sorted alphabetically in R aov
            # function. Same in statsmodels but capitalised names
            # are sorted differently. In R, a<b<B<c but in Python,
            # A<B<C<a<b<c. So, 'aero' tissue is before 'Bladder' in R,
            # not in python. Since in a linear regression
            # models, the order of the factor matters and the first
            # factor is used as a reference, we decided to use same
            # convention as in R.
            # see http://statsmodels.sourceforge.net/devel/contrasts.html
            # for a good explanation

            # The formula-based equivalent, i.e.
            #   smf.ols('Y~C(tissue)+C(media)+C(msi)+feature', data=df).fit()
            # followed by anova_lm(lm), gives the same answer as the code
            # below but is slower.

            # We could use pd.get_dummies but pretty slow
            # instead we create the full matrix in init() method.
            # One issue is that some columns end up with sum == 0
            # and need to be dropped.
            # .loc replaces the .ix indexer, which was removed in pandas 1.0.
            # Assumes masked_tissue.index holds row labels of
            # _tissue_dummies -- TODO confirm.
            df = self._tissue_dummies.loc[odof.masked_tissue.index]
            todrop = df.columns[df.values.sum(axis=0) == 0]
            if len(todrop) > 0:  # use if since drop() is slow
                df = df.drop(todrop, axis=1)

            # Here we set other variables with dataframe columns' names as
            # expected by OLS.
            if self.settings.include_media_factor == False:
                todrop = [x for x in df.columns if x.startswith('C(media)')]
                df = df.drop(todrop, axis=1)

            df['C(msi)[T.1]'] = odof.masked_msi.values
            df['feature'] = odof.masked_features.values

            self.Y = odof.Y
            self.EV = df.values
            # The regression and anova summary are done further down, in
            # the dispatch on settings.regression_method shared by all
            # three cases.

            # example of computing null model ?
            # Example of computing pvalues ourselves
            # with 100 000 samples, we can get a smooth distribution
            # that we can then fit with fitter. good distribution
            # for the raw data is uniform one but if we take the log10,
            # we have lots of possible distributions such as beta,
            # exponweib, gamma, ....
        elif self.settings.include_MSI_factor is True:
            # Formula equivalent: 'Y ~ C(msi) + feature'
            df = pd.DataFrame()
            df['C(msi)[T.1]'] = odof.masked_msi.values
            df['feature'] = odof.masked_features.values
            df.insert(0, 'Intercept', [1] * (odof.Npos + odof.Nneg))
        else:
            # Formula equivalent: 'Y ~ feature'
            df = pd.DataFrame()
            df['feature'] = odof.masked_features.values
            df.insert(0, 'Intercept', [1] * (odof.Npos + odof.Nneg))

        # Fit the linear model with the configured method. Ridge and Lasso
        # are the regularized fit with L1_wt forced to 0 and 1 respectively.
        if self.settings.regression_method == 'ElasticNet':
            self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                alpha=self.settings.regression_alpha,
                L1_wt=self.settings.regression_L1_wt)
        elif self.settings.regression_method == 'OLS':
            self.data_lm = OLS(odof.Y, df.values).fit()
        elif self.settings.regression_method == 'Ridge':
            self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                alpha=self.settings.regression_alpha, L1_wt=0)
        elif self.settings.regression_method == 'Lasso':
            self.data_lm = OLS(odof.Y, df.values).fit_regularized(
                alpha=self.settings.regression_alpha, L1_wt=1)
        else:
            # Without this branch self.data_lm would keep the model of a
            # previous call (or not exist at all) and be used silently.
            raise ValueError(
                'Unknown regression method %s. Use one of ElasticNet, '
                'OLS, Ridge, Lasso' % self.settings.regression_method)

        # str() so that non-string drug identifiers can be used as well.
        key = str(drug_id) + "__" + feature_name
        if self.sampling and key not in self.pvalues_features:
            # This can be computed for a drug once for all
            # no need to redo it for each feature ?
            # If the length of Y is too small (e.g., < 20) the results may not
            # be great. This can be checked with the errors
            self.samples1 = []
            self.samples2 = []
            self.samples3 = []
            Y = odof.Y.copy()
            N = self.sampling
            # Progress bar object; its animation is currently not used.
            pb = Progress(N, 20)
            # To get the random distribution, shuffle Y
            # and noise not required
            # To get the noise effects, do not shuffle and set noise to
            # something different from 0
            noise = 0.0
            for _ in range(N):
                pylab.shuffle(Y)  # in place, on the local copy only
                data_lm = OLS(Y + noise * pylab.randn(len(Y)), df.values).fit()
                anova_pvalues = self._get_anova_summary(data_lm, output='dict')
                # 'msi' and 'tissue' are only present for some settings.
                try:
                    self.samples1.append(anova_pvalues['msi'])
                except KeyError:
                    pass
                self.samples2.append(anova_pvalues['feature'])
                try:
                    self.samples3.append(anova_pvalues['tissue'])
                except KeyError:
                    pass
            # Imported here so fitter is only needed when sampling is used.
            import fitter
            ff = fitter.Fitter(-pylab.log10(self.samples2))
            dist = "genexpon"
            ff.distributions = [dist]
            ff.fit()
            self.pvalues_features[key] = {
                # .loc replaces the .ix indexer removed in pandas 1.0
                'error': ff.df_errors.loc[dist].values[0],
                'params': ff.fitted_param[dist],
                'feature': feature_name,
                'N': len(Y)
            }
            print(self.pvalues_features[key])

        self.anova_pvalues = self._get_anova_summary(self.data_lm,
                                                     output='dict')

        # Store the pvalues. Note that some may be missing so we use try
        # except, which is faster than if/else
        try:
            tissue_PVAL = self.anova_pvalues['tissue']
        except KeyError:
            tissue_PVAL = None

        try:
            MSI_PVAL = self.anova_pvalues['msi']
        except KeyError:
            MSI_PVAL = None

        try:
            FEATURE_PVAL = self.anova_pvalues['feature']
        except KeyError:
            FEATURE_PVAL = None

        try:
            MEDIA_PVAL = self.anova_pvalues['media']
        except KeyError:
            MEDIA_PVAL = None

        if show is True:
            boxplot = BoxPlots(odof,
                               savefig=self.settings.savefig,
                               directory=directory)
            boxplot.boxplot_association(fignum=1)

            # a boxplot to show cell lines effects. This requires
            # the settings.analysis_type to be PANCAN
            if self.settings.analysis_type == 'PANCAN':
                boxplot.boxplot_pancan(fignum=2, mode='tissue')
            if self.settings.include_MSI_factor:
                boxplot.boxplot_pancan(fignum=3, mode='msi')

        results = {
            'FEATURE': feature_name,
            'DRUG_ID': drug_id,
            'DRUG_NAME': drug_name,
            'DRUG_TARGET': drug_target,
            'N_FEATURE_pos': odof.Npos,
            'N_FEATURE_neg': odof.Nneg,
            'FEATURE_pos_logIC50_MEAN': odof.pos_IC50_mean,
            'FEATURE_neg_logIC50_MEAN': odof.neg_IC50_mean,
            'FEATURE_delta_MEAN_IC50': odof.delta_mean_IC50,
            'FEATURE_pos_IC50_sd': odof.pos_IC50_std,
            'FEATURE_neg_IC50_sd': odof.neg_IC50_std,
            'FEATURE_IC50_effect_size': odof.effectsize_ic50,
            'FEATURE_pos_Glass_delta': odof.pos_glass,
            'FEATURE_neg_Glass_delta': odof.neg_glass,
            'ANOVA_FEATURE_pval': FEATURE_PVAL,
            'ANOVA_TISSUE_pval': tissue_PVAL,
            'ANOVA_MSI_pval': MSI_PVAL,
            'ANOVA_MEDIA_pval': MEDIA_PVAL,
            'FEATURE_IC50_T_pval': odof.ttest  # pvalues is in index 1
        }

        # 12% of the time here
        if production is True:
            return results
        else:
            df = pd.DataFrame(results, index=[1])
            return df