Example #1
    def feat_importance_linear(self):
        """Select the numeric columns whose Pearson correlation with the
        target is statistically significant (p < 0.05)."""
        linear_list = []
        if self._pandas_flag:
            le = LabelEncoder()
            X_train = self.data_frame.drop(self.target, axis=1)
            if self.problem_type != 'REGRESSION':
                try:
                    Y_train = le.fit_transform(self.data_frame[self.target])
                except Exception:
                    # A mixed-type target cannot be encoded directly; cast to str first.
                    Y_train = le.fit_transform(self.data_frame[self.target].astype(str))
            else:
                Y_train = self.data_frame[self.target]
            # Pearson correlation is only defined for numeric predictors.
            X_train = X_train[X_train._get_numeric_data().columns]

            for c in list(X_train.columns):
                pearson_coef, p_value = stats.pearsonr(X_train[c], Y_train)
                if p_value < 0.05:
                    linear_list.append(c)
        else:
            if self.problem_type != 'REGRESSION':
                # String-index the target so a numeric correlation can be computed.
                indexer = StringIndexer(inputCol=self.target, outputCol="label")
                indexed = indexer.fit(self.data_frame).transform(self.data_frame)
                X_train = indexed.drop('label')
                num_var = [i[0] for i in X_train.dtypes if i[1] in ('int', 'double')]
                # The row count is the same for every column, so compute it once.
                num_of_samples = indexed.select(num_var[0]).count()
                for column_one in num_var:
                    corr = indexed.corr(column_one, 'label')
                    # Significance of the correlation via a t-test with n - 2 df.
                    df = num_of_samples - 2
                    std_error = math.sqrt(old_div((1 - math.pow(corr, 2)), df))
                    t_value = old_div(corr, std_error)
                    p_value = Stats.t_distribution_critical_value(t_value, df=df)
                    if p_value < 0.05:
                        linear_list.append(column_one)
            else:
                X_train = self.data_frame.drop(self.target)
                num_var = [i[0] for i in X_train.dtypes if i[1] in ('int', 'double')]
                for column_one in num_var:
                    corr = self.data_frame.corr(column_one, self.target)
                    num_of_samples = self.data_frame.select(column_one).count()
                    df = num_of_samples - 2
                    std_error = math.sqrt(old_div((1 - math.pow(corr, 2)), df))
                    t_value = old_div(corr, std_error)
                    p_value = Stats.t_distribution_critical_value(t_value, df=df)
                    if p_value < 0.05:
                        linear_list.append(column_one)
        self.data_change_dict['SelectedColsLinear'] = linear_list
        # The target column is always carried along with the selected features.
        linear_list.append(self.target)
        return linear_list
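
The Spark branches above turn each Pearson correlation into a p-value by hand via t = r / sqrt((1 - r^2) / (n - 2)). Below is a minimal standalone sketch of that conversion on plain NumPy data, assuming Stats.t_distribution_critical_value (not shown here) returns a two-sided p-value and using scipy.stats as a stand-in:

    import math
    import numpy as np
    from scipy import stats

    rng = np.random.default_rng(0)
    x = rng.normal(size=200)
    y = 0.3 * x + rng.normal(size=200)           # weakly correlated toy data

    r, p_scipy = stats.pearsonr(x, y)
    n = len(x)
    df = n - 2
    std_error = math.sqrt((1 - r ** 2) / df)     # same expression as the Spark branches
    t_value = r / std_error
    p_manual = 2 * stats.t.sf(abs(t_value), df)  # two-sided p-value from the t distribution

    print(p_manual, p_scipy)                     # the two p-values should agree closely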
Example #2
    def _ttest_with_different_sample_variances(self, sample1, sample2,
                                               sample1_variance,
                                               sample2_variance):
        """Welch's t-test for two samples with unequal variances."""
        sample1_size = sample1.count()
        sample2_size = sample2.count()
        sample1_mean = Stats.mean(sample1, self._dependent_var)
        sample2_mean = Stats.mean(sample2, self._dependent_var)
        variance_of_mean1 = sample1_variance / sample1_size
        variance_of_mean2 = sample2_variance / sample2_size
        # Welch-Satterthwaite approximation of the degrees of freedom.
        degrees_of_freedom = math.pow(
            variance_of_mean1 + variance_of_mean2, 2) / (
                math.pow(variance_of_mean1, 2) / (sample1_size - 1) +
                math.pow(variance_of_mean2, 2) / (sample2_size - 1))
        t_value = (sample1_mean - sample2_mean) / math.sqrt(
            variance_of_mean1 + variance_of_mean2)
        p_value = Stats.t_distribution_critical_value(t_value,
                                                      df=degrees_of_freedom)

        return IndependentSampleTTestResult(
            indep_variable=self._independent_var,
            dep_variable=self._dependent_var,
            sample1_level=self._independent_var_levels[0],
            sample1_mean=sample1_mean,
            sample1_variance=sample1_variance,
            sample2_level=self._independent_var_levels[1],
            sample2_mean=sample2_mean,
            sample2_variance=sample2_variance,
            t_value=t_value,
            p_value=p_value,
            df=degrees_of_freedom)
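
For reference, a minimal standalone sketch of the same Welch computation on plain NumPy arrays (the project's Stats helpers and sample objects are not shown here, so toy data is assumed); scipy.stats.ttest_ind with equal_var=False reproduces the statistic:

    import numpy as np
    from scipy import stats

    rng = np.random.default_rng(1)
    sample1 = rng.normal(loc=0.0, scale=1.0, size=40)
    sample2 = rng.normal(loc=0.5, scale=2.0, size=55)

    v1, v2 = sample1.var(ddof=1), sample2.var(ddof=1)
    n1, n2 = len(sample1), len(sample2)
    se2 = v1 / n1 + v2 / n2
    t_value = (sample1.mean() - sample2.mean()) / np.sqrt(se2)
    df = se2 ** 2 / ((v1 / n1) ** 2 / (n1 - 1) + (v2 / n2) ** 2 / (n2 - 1))

    t_ref, p_ref = stats.ttest_ind(sample1, sample2, equal_var=False)
    print(t_value, t_ref)   # the two statistics should agree
    print(df, p_ref)        # Welch-Satterthwaite df and two-sided p-value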
Example #3
    def _ttest_unequal_size_samples_with_same_variance(self, sample1, sample2,
                                                       sample1_variance,
                                                       sample2_variance):
        """Student's t-test with a pooled variance estimate for two samples
        of (possibly) different sizes."""
        sample1_size = sample1.count()
        sample2_size = sample2.count()
        sample1_mean = Stats.mean(sample1, self._dependent_var)
        sample2_mean = Stats.mean(sample2, self._dependent_var)
        degrees_of_freedom = sample1_size + sample2_size - 2
        # Pooled standard deviation weights each sample variance by its
        # degrees of freedom (n - 1).
        pooled_std_dev = math.sqrt(
            ((sample1_size - 1) * sample1_variance +
             (sample2_size - 1) * sample2_variance) / degrees_of_freedom)
        std_err = pooled_std_dev * math.sqrt((1 / sample1_size) +
                                             (1 / sample2_size))
        t_value = (sample1_mean - sample2_mean) / std_err
        p_value = Stats.t_distribution_critical_value(t_value,
                                                      df=degrees_of_freedom)

        return IndependentSampleTTestResult(
            indep_variable=self._independent_var,
            dep_variable=self._dependent_var,
            sample1_level=self._independent_var_levels[0],
            sample1_mean=sample1_mean,
            sample1_variance=sample1_variance,
            sample2_level=self._independent_var_levels[1],
            sample2_mean=sample2_mean,
            sample2_variance=sample2_variance,
            t_value=t_value,
            p_value=p_value,
            df=degrees_of_freedom)
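
This is the classic pooled-variance (Student's) t-test. A minimal standalone sketch, using NumPy arrays in place of the Spark samples, that reproduces the same statistic with scipy.stats.ttest_ind (whose default is equal_var=True):

    import math
    import numpy as np
    from scipy import stats

    rng = np.random.default_rng(2)
    sample1 = rng.normal(loc=0.0, size=30)
    sample2 = rng.normal(loc=0.4, size=50)

    n1, n2 = len(sample1), len(sample2)
    df = n1 + n2 - 2
    pooled_sd = math.sqrt(((n1 - 1) * sample1.var(ddof=1) +
                           (n2 - 1) * sample2.var(ddof=1)) / df)
    std_err = pooled_sd * math.sqrt(1 / n1 + 1 / n2)
    t_value = (sample1.mean() - sample2.mean()) / std_err

    t_ref, p_ref = stats.ttest_ind(sample1, sample2)  # equal_var=True by default
    print(t_value, t_ref)   # the two statistics should agree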
Example #4
    def _ttest_equal_size_samples_with_same_variance(self, sample_size,
                                                     sample1, sample2,
                                                     sample1_variance,
                                                     sample2_variance):
        """Student's t-test for two equal-sized samples with equal variances."""
        sample1_mean = Stats.mean(sample1, self._dependent_var)
        sample2_mean = Stats.mean(sample2, self._dependent_var)
        # With equal sample sizes the pooled variance is the simple average of
        # the two sample variances.
        pooled_standard_deviation = math.sqrt(
            (sample1_variance + sample2_variance) / 2)
        standard_error = pooled_standard_deviation * math.sqrt(
            2.0 / sample_size)
        t_value = (sample1_mean - sample2_mean) / standard_error
        degrees_of_freedom = 2 * sample_size - 2
        p_value = Stats.t_distribution_critical_value(t_value,
                                                      df=degrees_of_freedom)

        return IndependentSampleTTestResult(
            indep_variable=self._independent_var,
            dep_variable=self._dependent_var,
            sample1_level=self._independent_var_levels[0],
            sample1_mean=sample1_mean,
            sample1_variance=sample1_variance,
            sample2_level=self._independent_var_levels[1],
            sample2_mean=sample2_mean,
            sample2_variance=sample2_variance,
            t_value=t_value,
            p_value=p_value,
            df=degrees_of_freedom)
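
A small standalone check (toy data, not the project's Stats helpers) that the equal-sample-size shortcuts above agree with the general pooled formulas from the previous method:

    import math
    import numpy as np

    rng = np.random.default_rng(3)
    n = 25
    sample1 = rng.normal(size=n)
    sample2 = rng.normal(loc=0.3, size=n)
    v1, v2 = sample1.var(ddof=1), sample2.var(ddof=1)

    pooled_general = math.sqrt(((n - 1) * v1 + (n - 1) * v2) / (2 * n - 2))
    pooled_equal_n = math.sqrt((v1 + v2) / 2)
    print(pooled_general, pooled_equal_n)               # identical pooled standard deviations
    print(pooled_general * math.sqrt(1 / n + 1 / n),
          pooled_equal_n * math.sqrt(2.0 / n))          # identical standard errors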
Example #5
    def _corr(self, column_one, column_two):
        """
        Finds the correlation between two columns and also calculates
            a) statistical significance info,
            b) effect size info (the coefficient of determination), and
            c) confidence intervals.

        :param column_one: name of the first column
        :param column_two: name of the second column
        :return: a CorrelationStats instance
        """
        corr = self._data_frame.corr(column_one, column_two)
        num_of_samples = self._data_frame.select(column_one).count()
        df = num_of_samples - 2
        # Significance of a Pearson correlation: t = r / sqrt((1 - r^2) / (n - 2)).
        std_error = math.sqrt((1 - math.pow(corr, 2)) / df)
        t_value = corr / std_error
        p_value = Stats.t_distribution_critical_value(t_value, df=df)
        # Coefficient of determination (r squared) as the effect size.
        coeff_determination = math.pow(corr, 2)

        corr_stats = CorrelationStats(correlation=corr, std_error=std_error, t_value=t_value, p_value=p_value,
                                      degrees_of_freedom=df, coeff_determination=coeff_determination)
        for alpha in ALPHA_LEVELS:
            (lower_bound, upper_bound) = self._confidence_interval(corr, num_of_samples, alpha)
            corr_stats.set_confidence_interval(alpha, lower_bound, upper_bound)

        return corr_stats
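
_confidence_interval and ALPHA_LEVELS are not shown here, so the sketch below only illustrates the usual Fisher z-transform construction for a Pearson correlation confidence interval; the fisher_ci helper is hypothetical and the project's actual implementation may differ.

    import math
    from scipy import stats

    def fisher_ci(corr, num_of_samples, alpha):
        z = math.atanh(corr)                        # Fisher z-transform of r
        se = 1.0 / math.sqrt(num_of_samples - 3)    # standard error of z
        z_crit = stats.norm.ppf(1 - alpha / 2)
        lower, upper = z - z_crit * se, z + z_crit * se
        return math.tanh(lower), math.tanh(upper)   # back-transform to the r scale

    print(fisher_ci(0.42, num_of_samples=120, alpha=0.05))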
Example #6
    def test(self):
        """Paired (dependent samples) t-test on the per-row differences
        between the two columns."""
        column1 = FN.col(self._column1)
        column2 = FN.col(self._column2)
        diff_column_name = 'diff'
        diff_expr = (column2 - column1).alias(diff_column_name)
        sample_of_differences = self._data_frame.select(diff_expr)
        sample_size = sample_of_differences.count()
        sample_mean = Stats.mean(sample_of_differences, diff_column_name)
        sample_sd = Stats.standard_deviation(sample_of_differences, diff_column_name)
        # t = mean(diff) / (sd(diff) / sqrt(n)), with n - 1 degrees of freedom.
        t_value = float(sample_mean) / (old_div(sample_sd, math.sqrt(sample_size)))
        degree_of_freedom = sample_size - 1
        p_value = Stats.t_distribution_critical_value(t_value, df=degree_of_freedom)

        return DependentSampleTtestResult(column1=self._column1, column2=self._column2, sample_size=sample_size,
                                          mean_of_differences=sample_mean, df=degree_of_freedom, t_value=t_value,
                                          p_value=p_value)
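
A minimal standalone sketch of the same paired test on plain arrays, assuming Stats.standard_deviation is the sample (ddof=1) standard deviation; scipy.stats.ttest_rel reproduces the statistic:

    import math
    import numpy as np
    from scipy import stats

    rng = np.random.default_rng(4)
    before = rng.normal(loc=10.0, size=60)
    after = before + rng.normal(loc=0.5, scale=1.0, size=60)

    diff = after - before                      # mirrors column2 - column1
    n = len(diff)
    t_value = diff.mean() / (diff.std(ddof=1) / math.sqrt(n))

    t_ref, p_ref = stats.ttest_rel(after, before)
    print(t_value, t_ref)   # the two statistics should agree
    print(n - 1, p_ref)     # degrees of freedom and two-sided p-value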