예제 #1
0
def detect_label_outliers(y, encoder=None):
    """Flag labels lying more than three standard deviations from the mean.

    The labels are optionally decoded with ``encoder``, missing values are
    filled with the median, the result is rescaled with ``RobustScaler``, and
    every sample outside ``mean +/- 3 * std`` of the rescaled values is
    marked. Nothing is removed here; the mask is only computed and returned.

    Args:
        y: Array of labels; it must support ``reshape`` once decoded.
        encoder: Optional object exposing ``inverse_transform``. When given,
            the statistics are computed on the decoded labels.

    Returns:
        Boolean mask with one entry per sample, ``True`` where the sample is
        considered an outlier.
    """
    # TODO per-class outlier removal also

    # Compare against None explicitly: an encoder object may define
    # __len__/__bool__ and evaluate as falsy even though it was supplied.
    if encoder is not None:
        y = encoder.inverse_transform(y)

    # Fill missing values with the median, then rescale the single column.
    # NOTE(review): if these are scikit-learn's classes, RobustScaler centers
    # on the median and scales by the interquartile range (not mean/variance),
    # and Imputer was superseded by sklearn.impute.SimpleImputer in newer
    # releases - confirm against the installed version.
    column = y.reshape([-1, 1])
    imputed = Imputer(strategy="median").fit_transform(column)
    y_std = RobustScaler().fit_transform(imputed).reshape([-1])

    # Compute the statistics once instead of once per comparison.
    center = y_std.mean()
    spread = 3*y_std.std()
    mask = (y_std < center - spread) | (y_std > center + spread)
    logger.info("Filtering {}/{} as outliers.".format(mask.sum(), len(mask)))
    return mask
예제 #2
0
    # Standardize X with scale().
    X_scale=scale(X)
    
elif c==2:# linear (min-max) normalization
    X_scale=minmax_scale(X)
    
elif c==0:# no normalization: use X unchanged
    X_scale=X
    
elif c==3:# robust normalization
    from sklearn.preprocessing import RobustScaler
    X_scale=RobustScaler().fit_transform(X)

# Report the normalized data (Python 2 print statements).
print 'the standar result of X is:',X_scale
## Check X_scale: normally the mean is 0 and the variance is 1.
## NOTE(review): that expectation presumably only applies to the
## standardized branch, not to the min-max / unscaled branches - confirm.
#1.
print 'mean=',X_scale.mean()
print 'std=',X_scale.std()
#2.
print 'min=',X_scale.min()
print 'max=',X_scale.max()
# Close csv_file1 once the normalized values have been reported.
csv_file1.close()

## For ease of understanding and simpler notation, keep working with the
## normalized data under the name X.
X=X_scale

## Statistics after normalization
## Get the statistics of X
statistics(X)
## Frequency distribution histogram
#drawHist(X,'AOD','Frequency','the Frequency of standar AOD')
## Cumulative frequency plot