예제 #1
0
def outlier_dates(cy, alp, all_pollutants_df, pol_list):
    '''
    Print, per pollutant, when the first max-side Grubbs outlier occurred.

    cy = county (passed to county_pollutants and echoed in the message)
    alp = alpha value for outlier function
    all_pollutants_df = source data handed to county_pollutants
    pol_list = pollutant keys to check; each must be a column of the
               county frame and a key of the pollutant lookup below

    Nothing is returned; one line is printed for every pollutant that has
    at least one outlier. Only the first reported outlier and the first
    row holding that value are used.
    '''
    from outliers import smirnov_grubbs as grubbs

    # pollutant key -> [unit, display name]; only the display name is printed
    pollutant_info = {
        'co': ['ppm', 'Carbon monoxide'],
        'no2': ['ppb', 'Nitrogen dioxide (NO2)'],
        'ozone': ['ppm', 'Ozone'],
        'pb': ['ug/m3', 'Lead'],
        'pm2_5': ['ug/m3', 'PM2.5'],
        'pm10': ['ug/m3', 'PM10'],
        'so2': ['ppb', 'Sulfur dioxide']
    }

    county_pollutants_df = county_pollutants(cy, all_pollutants_df, pol_list)

    for p in pol_list:
        readings = county_pollutants_df[p]
        flagged = grubbs.max_test_outliers(list(readings), alpha=alp)
        if len(flagged) == 0:
            # No outlier for this pollutant: nothing to report
            continue

        # Locate the first row whose value equals the first outlier.
        # presumably the index holds timestamps, so the first 10 characters
        # of its string form are the date -- verify against county_pollutants
        matching_rows = county_pollutants_df[readings == flagged[0]]
        when = str(matching_rows.index[0])
        print('The %s %s outlier occured on %s' %
              (cy, pollutant_info[p][1], when[:10]))
예제 #2
0
def getStatistics(list):
    """Summarise a sequence of numbers as integer mean/var/std plus max outlier.

    The first element of each ``stats.bayes_mvs`` result is truncated to an
    ``int`` (or replaced by 0 when it is NaN). The ``'outlier'`` entry is the
    largest value reported by the max-side Grubbs test at alpha=0.05, or 0
    when the test reports nothing.

    NOTE: the parameter name shadows the builtin ``list``; it is kept because
    it is part of the function's signature.

    Returns a dict with keys ``'mean'``, ``'var'``, ``'std'``, ``'outlier'``.
    """
    frame = pd.DataFrame(list)
    center, variance, spread = stats.bayes_mvs(frame, alpha=0.95)

    statsm = {}
    estimates = (('mean', center), ('var', variance), ('std', spread))
    for label, estimate in estimates:
        point = estimate[0]
        # NaN cannot be converted to int, so it is reported as 0 instead
        statsm[label] = 0 if math.isnan(point) else int(point)

    out = grubbs.max_test_outliers(list, alpha=0.05)
    statsm['outlier'] = numpy.amax(out) if out else 0
    return statsm
예제 #3
0
def test_grubbs(rv, N=200, alpha=0.05, N_reps=1000):
    """Estimate how often the max-side Grubbs test flags samples drawn from rv.

    Draws ``N_reps`` samples of size ``N`` via ``rv.rvs`` and runs
    ``grubbs.max_test_outliers`` on each one at significance ``alpha``.

    Returns a tuple ``(fraction, values)``: the fraction of repetitions in
    which at least one value was flagged, and a flat list of every flagged
    value across all repetitions.
    """
    val_fa = []
    n_fa = 0
    for _ in range(N_reps):
        sample = rv.rvs(size=N)
        flagged = grubbs.max_test_outliers(sample, alpha=alpha)
        if len(flagged) == 0:
            continue
        n_fa += 1
        val_fa.extend(flagged)

    return n_fa / N_reps, val_fa
예제 #4
0
def outlier_test(values):
    '''
    Inputs: Absorbance values to be used in the outlier test.
    Outputs: Nothing is returned. If an outlier exists, prints
    "Outlier exists:" followed by the value(s) deemed outliers. If no
    outlier exists, prints "No outlier".

    The Grubbs max test (alpha=0.05) is only run when the relative standard
    deviation of the values exceeds 10 percent.
    '''
    relstdev = 100 * (np.std(values) / np.mean(values))
    if relstdev > 10:
        outlier = grubbs.max_test_outliers(values, alpha=.05)
        # A high spread alone does not mean the test flagged anything: only
        # claim an outlier when the test actually returned one.
        if len(outlier) > 0:
            print('Outlier exists:', outlier)
            return

    print('No outlier')
예제 #5
0
def grubbs_cal(inputList, significance=0.05):
    """Print the max-side Grubbs outliers of inputList.

    ``significance`` is forwarded as the test's ``alpha``. Nothing is
    returned; the result of the test is only printed.
    """
    print(grubbs.max_test_outliers(inputList, alpha=significance))
예제 #6
0
# Check whether Age is missing at random: cross-tabulate the rows without
# an Age by Survived and Pclass.
age_missing = train.loc[train.Age.isnull()]
print(pd.crosstab(age_missing['Survived'], age_missing['Pclass'], rownames=["Rows Missing Age by Survived and Pclass"]))

# ----------------------------------------------------------------------- #
# ------------------------ Dealing with Outliers ------------------------ #
# ----------------------------------------------------------------------- #

# One box plot per float64 column, drawn from the non-NaN rows only
numeric_cols = [name for name in train.columns if train[name].dtype == 'float64']
for col in numeric_cols:
    present = train[~np.isnan(train[col])]
    sns.boxplot(y=present[col])
    plt.title("Box Plot for " + col)
    plt.show()

# Grubbs max test on Fare: show the rows whose fare equals the first outlier.
# Original author's note: this is regarded as the iterative Grubbs /
# generalized extreme studentized deviates test -- confirm against the
# outliers package.
fare_outliers = grubbs.max_test_outliers(train['Fare'], alpha=0.05)
print(train.loc[train.Fare == fare_outliers[0]])

# Compute fare per person, since some passengers bought group tickets
# producing fares that are sums of the individual ticket prices.
train['Set'] = 'train'
test['Set'] = 'test'
train_features = train.drop(columns=['Survived'])
alldata = pd.concat([train_features, test], ignore_index=True)
# Group size = number of passengers sharing the same (Fare, Ticket) pair
ticket_groups = alldata.groupby(['Fare', 'Ticket'])
alldata['Group_Size'] = ticket_groups['PassengerId'].transform("count")
alldata['Fare_Per_Person'] = alldata['Fare'] / alldata['Group_Size']

# Plot fare per person for first and second passenger class
class_titles = (
    (1, "Box Plot for Fare Per Person - First Class"),
    (2, "Box Plot for Fare Per Person - Second Class"),
)
for pclass, title in class_titles:
    in_class = alldata[alldata.Pclass == pclass]
    sns.boxplot(y=in_class['Fare_Per_Person'].values)
    plt.title(title)
    plt.show()
    def test_one_sided_max_outlier_detection(self):
        """The max-side test must flag the sample maximum but not the minimum."""
        sample = self.rvs
        detected = grubbs.max_test_outliers(sample, alpha=self.default_alpha)

        largest = sample.max()
        self.assertIn(largest, detected)
        smallest = sample.min()
        self.assertNotIn(smallest, detected)
예제 #8
0
 def get_max_outliers(self, alpha=0.05):
     """Return the max-side Grubbs outliers of the current window.

     ``self.window`` is wrapped in a pandas Series before the test is run;
     ``alpha`` is forwarded unchanged as the significance level.
     """
     return grubbs.max_test_outliers(pd.Series(self.window), alpha=alpha)
예제 #9
0
    def test_one_sided_max_outlier_detection(self):
        """The max-side test must flag the sample maximum but not the minimum."""
        sample = self.rvs
        detected = grubbs.max_test_outliers(sample, alpha=self.default_alpha)

        largest = sample.max()
        self.assertIn(largest, detected)
        smallest = sample.min()
        self.assertNotIn(smallest, detected)
예제 #10
0
def outliersTest(list):
    """Return the max-side Grubbs outliers of the given values at alpha=0.05.

    NOTE: the parameter name shadows the builtin ``list``; it is kept because
    it is part of the function's signature.
    """
    flagged = grubbs.max_test_outliers(list, alpha=0.05)
    return flagged
예제 #11
0
# Significance level passed to the Grubbs test as alpha
pval_thr = 0.00001

nd_regions = pd.read_csv(f, sep=',')
# Keep only the region types that have at least 10 rows with a non-null
# 'Mean.Rho.bp.' value.
ok = nd_regions.type[~nd_regions['Mean.Rho.bp.'].isnull()].value_counts()
ok = ok[ok >= 10].index
filter_data = nd_regions[nd_regions.type.isin(ok)].copy()

# Fill missing NucDiv values with the column minimum.
# Assign the result back to the column: a chained
# `filter_data.NucDiv.fillna(..., inplace=True)` acts on a temporary object
# and leaves the frame untouched under pandas Copy-on-Write.
mmin = filter_data.NucDiv.min()
filter_data['NucDiv'] = filter_data['NucDiv'].fillna(mmin)

################# outliers by region ##################
# Run the max-side Grubbs test on 'Mean.Rho.bp.' separately for each type
gb = filter_data.groupby(['type'])['Mean.Rho.bp.']
outliers_by_region = gb.apply(
    lambda x: grubbs.max_test_outliers(x.dropna(), alpha=pval_thr))

# Collect the index label of the first row per type whose value was flagged.
# NOTE(review): only one row per type is recorded even when several values
# were flagged -- confirm this is intended.
outliersByReg = []
for ty in outliers_by_region.index:
    ii = filter_data[(filter_data.type == ty) & (
        filter_data['Mean.Rho.bp.'].isin(outliers_by_region[ty]))].index.values
    if len(ii) > 0:
        outliersByReg.append(ii[0])

# Set the flagged rows aside, then remove them from the working data
outlier_data = filter_data.loc[outliersByReg, :]

filter_data = filter_data.drop(outliersByReg)

# Per-type means of the remaining rows
mm = filter_data.groupby(['type'])[[
    'NucDiv', 'GC', 'Mean.Rho.bp.', u'X.95..CI', u'X.95..CI.1', 'length'
]].mean()