def feat_importance_linear(self):
    """Select features that are linearly correlated with the target (p < 0.05).

    Pandas path: Pearson correlation per numeric column via ``stats.pearsonr``;
    classification targets are label-encoded first. Spark path: uses
    ``DataFrame.corr`` plus a manual t-test on the correlation coefficient.

    Side effect: records the selected columns (without the target) in
    ``self.data_change_dict['SelectedColsLinear']``.

    :return: list of selected column names with the target name appended.
    """
    linear_list = []
    if self._pandas_flag:
        le = LabelEncoder()
        X_train = self.data_frame.drop(self.target, axis=1)
        if self.problem_type != 'REGRESSION':
            try:
                Y_train = le.fit_transform(self.data_frame[self.target])
            except Exception:
                # Mixed-type labels can break the encoder; retry on strings.
                # (Narrowed from a bare `except:` that also caught
                # KeyboardInterrupt/SystemExit.)
                Y_train = le.fit_transform(self.data_frame[self.target].astype(str))
        else:
            Y_train = self.data_frame[self.target]
        # Pearson correlation is only defined for numeric columns.
        X_train = X_train[X_train._get_numeric_data().columns]
        for c in list(X_train.columns):
            pearson_coef, p_value = stats.pearsonr(X_train[c], Y_train)
            if p_value < 0.05:
                linear_list.append(c)
    else:
        if self.problem_type != 'REGRESSION':
            # Encode the categorical target into a numeric 'label' column.
            indexer = StringIndexer(inputCol=self.target, outputCol="label")
            indexed = indexer.fit(self.data_frame).transform(self.data_frame)
            X_train = indexed.drop('label')
            num_var = [i[0] for i in X_train.dtypes if ((i[1] == 'int') | (i[1] == 'double'))]
            # Row count is identical for every column; compute it once.
            num_of_samples = indexed.select(num_var[0]).count()
            for column_one in num_var:
                corr = indexed.corr(column_one, 'label')
                df = num_of_samples - 2
                # t-test on the correlation coefficient: t = r / SE(r),
                # SE(r) = sqrt((1 - r^2) / (n - 2)).
                std_error = math.sqrt(old_div((1 - math.pow(corr, 2)), df))
                t_value = old_div(corr, std_error)
                p_value = Stats.t_distribution_critical_value(t_value, df=df)
                if p_value < 0.05:
                    linear_list.append(column_one)
        else:
            X_train = self.data_frame.drop(self.target)
            num_var = [i[0] for i in X_train.dtypes if ((i[1] == 'int') | (i[1] == 'double'))]
            for column_one in num_var:
                corr = self.data_frame.corr(column_one, self.target)
                num_of_samples = self.data_frame.select(column_one).count()
                df = num_of_samples - 2
                std_error = math.sqrt(old_div((1 - math.pow(corr, 2)), df))
                t_value = old_div(corr, std_error)
                p_value = Stats.t_distribution_critical_value(t_value, df=df)
                if p_value < 0.05:
                    linear_list.append(column_one)
    # Store a copy: the original assigned the same list object, so the
    # `append(self.target)` below silently mutated the recorded selection too.
    self.data_change_dict['SelectedColsLinear'] = list(linear_list)
    linear_list.append(self.target)
    return linear_list
def _ttest_with_different_sample_variances(self, sample1, sample2, sample1_variance, sample2_variance):
    """Welch's t-test for two independent samples with unequal variances.

    Degrees of freedom come from the Welch-Satterthwaite approximation.

    :return: IndependentSampleTTestResult with means, variances, t, p and df.
    """
    n1 = sample1.count()
    n2 = sample2.count()
    mean1 = Stats.mean(sample1, self._dependent_var)
    mean2 = Stats.mean(sample2, self._dependent_var)
    # Variance of each sample mean.
    var_over_n1 = sample1_variance / n1
    var_over_n2 = sample2_variance / n2
    # Welch-Satterthwaite degrees of freedom.
    df_numerator = math.pow(var_over_n1 + var_over_n2, 2)
    df_denominator = (
        math.pow(sample1_variance, 2) / (math.pow(n1, 2) * (n1 - 1))
        + math.pow(sample2_variance, 2) / (math.pow(n2, 2) * (n2 - 1)))
    degrees_of_freedom = df_numerator / df_denominator
    t_value = (mean1 - mean2) / math.sqrt(var_over_n1 + var_over_n2)
    p_value = Stats.t_distribution_critical_value(t_value, df=degrees_of_freedom)
    return IndependentSampleTTestResult(
        indep_variable=self._independent_var,
        dep_variable=self._dependent_var,
        sample1_level=self._independent_var_levels[0],
        sample1_mean=mean1,
        sample1_variance=sample1_variance,
        sample2_level=self._independent_var_levels[1],
        sample2_mean=mean2,
        sample2_variance=sample2_variance,
        t_value=t_value,
        p_value=p_value,
        df=degrees_of_freedom)
def _ttest_unequal_size_samples_with_same_variance(self, sample1, sample2, sample1_variance, sample2_variance):
    """Student's t-test for two unequal-size samples assuming equal variances.

    Pools the two sample variances (weighted by each sample's df) to form
    the standard error of the difference of means.

    :return: IndependentSampleTTestResult with means, variances, t, p and df.
    """
    n1 = sample1.count()
    n2 = sample2.count()
    mean1 = Stats.mean(sample1, self._dependent_var)
    mean2 = Stats.mean(sample2, self._dependent_var)
    degrees_of_freedom = n1 + n2 - 2
    # Pooled variance: df-weighted average of the two sample variances.
    pooled_variance = ((n1 - 1) * sample1_variance + (n2 - 1) * sample2_variance) / degrees_of_freedom
    pooled_std_dev = math.sqrt(pooled_variance)
    std_err = pooled_std_dev * math.sqrt((1 / n1) + (1 / n2))
    t_value = (mean1 - mean2) / std_err
    p_value = Stats.t_distribution_critical_value(t_value, df=degrees_of_freedom)
    return IndependentSampleTTestResult(
        indep_variable=self._independent_var,
        dep_variable=self._dependent_var,
        sample1_level=self._independent_var_levels[0],
        sample1_mean=mean1,
        sample1_variance=sample1_variance,
        sample2_level=self._independent_var_levels[1],
        sample2_mean=mean2,
        sample2_variance=sample2_variance,
        t_value=t_value,
        p_value=p_value,
        df=degrees_of_freedom)
def _ttest_equal_size_samples_with_same_variance(self, sample_size, sample1, sample2, sample1_variance, sample2_variance):
    """Student's t-test for two equal-size samples assuming equal variances.

    With equal n the pooled standard deviation is simply the square root of
    the average of the two sample variances.

    :return: IndependentSampleTTestResult with means, variances, t, p and df.
    """
    mean1 = Stats.mean(sample1, self._dependent_var)
    mean2 = Stats.mean(sample2, self._dependent_var)
    # Equal-n pooled SD: sqrt of the mean of the two variances.
    pooled_sd = math.sqrt((sample1_variance + sample2_variance) / 2)
    std_err = pooled_sd * math.sqrt(2.0 / sample_size)
    t_value = (mean1 - mean2) / std_err
    degrees_of_freedom = 2 * sample_size - 2
    p_value = Stats.t_distribution_critical_value(t_value, df=degrees_of_freedom)
    return IndependentSampleTTestResult(
        indep_variable=self._independent_var,
        dep_variable=self._dependent_var,
        sample1_level=self._independent_var_levels[0],
        sample1_mean=mean1,
        sample1_variance=sample1_variance,
        sample2_level=self._independent_var_levels[1],
        sample2_mean=mean2,
        sample2_variance=sample2_variance,
        t_value=t_value,
        p_value=p_value,
        df=degrees_of_freedom)
def _corr(self, column_one, column_two):
    """
    Finds correlation between two columns, also calculates
    a) statistical significance info
    b) effect size info - coefficient of determination, and
    c) confidence intervals.
    :param column_one:
    :param column_two:
    :return: CorrelationStats populated with the above.
    """
    correlation = self._data_frame.corr(column_one, column_two)
    sample_count = self._data_frame.select(column_one).count()
    dof = sample_count - 2
    # t-test on the correlation: t = r / SE(r), SE(r) = sqrt((1 - r^2) / (n - 2)).
    standard_error = math.sqrt((1 - math.pow(correlation, 2)) / dof)
    t_stat = correlation / standard_error
    significance = Stats.t_distribution_critical_value(t_stat, df=dof)
    # Effect size: coefficient of determination r^2.
    r_squared = math.pow(correlation, 2)
    corr_stats = CorrelationStats(correlation=correlation,
                                  std_error=standard_error,
                                  t_value=t_stat,
                                  p_value=significance,
                                  degrees_of_freedom=dof,
                                  coeff_determination=r_squared)
    # Attach a confidence interval for every configured alpha level.
    for alpha in ALPHA_LEVELS:
        lower_bound, upper_bound = self._confidence_interval(correlation, sample_count, alpha)
        corr_stats.set_confidence_interval(alpha, lower_bound, upper_bound)
    return corr_stats
def test(self):
    """Paired (dependent) sample t-test on the per-row differences
    column2 - column1.

    :return: DependentSampleTtestResult with the mean difference, t, p and df.
    """
    diff_name = 'diff'
    # Build a one-column frame of per-row differences.
    difference_expr = (FN.col(self._column2) - FN.col(self._column1)).alias(diff_name)
    differences = self._data_frame.select(difference_expr)
    n = differences.count()
    mean_diff = Stats.mean(differences, diff_name)
    sd_diff = Stats.standard_deviation(differences, diff_name)
    # t = mean(d) / (sd(d) / sqrt(n))
    standard_error = old_div(sd_diff, math.sqrt(n))
    t_stat = float(mean_diff) / standard_error
    dof = n - 1
    significance = Stats.t_distribution_critical_value(t_stat, df=dof)
    return DependentSampleTtestResult(column1=self._column1,
                                      column2=self._column2,
                                      sample_size=n,
                                      mean_of_differences=mean_diff,
                                      df=dof,
                                      t_value=t_stat,
                                      p_value=significance)