Example #1
def clean_outliers(data, thresh):
    '''
    (by Alejandro Núñez)

    Cleans data of outliers, replacing them with numpy NaNs. A point *x* is identified as an outlier if \| *x* - *med* \| / *MAD* > *thresh*, where *med* is the median of the data values and *MAD* is the median absolute deviation, defined as 1.482 * median(\| *x* - *med* \|).
    
    This function mimics IDL mc_findoutliers (by Mike Cushing), with output differences.
    
    *data*
      Array with data values.
    *thresh*
      The sigma threshold that defines data outliers.
    '''
    # Check inputs
    try:
        data[0]
    except TypeError:
        print('Data invalid.')
        return
    
    # Calculate median and median absolute deviation
    med = sps.nanmedian(data)
    mad = 1.482 * sps.nanmedian(abs(data-med))
    
    dataClean = np.array(data, dtype=float)  # float copy so NaN can be assigned
    if mad == 0:
        print('MAD is equal to zero.')
    else:
        outlierIdx = np.where(abs((dataClean - med) / mad) > thresh)
        if len(outlierIdx[0]) != 0:
            dataClean[outlierIdx] = np.nan
    
    return dataClean
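
A minimal usage sketch, assuming sps is a scipy.stats old enough (< 1.0) to still provide nanmedian; on current stacks np.nanmedian is the drop-in replacement:

import numpy as np

vals = np.array([1.0, 1.1, 0.9, 1.05, 12.0])
cleaned = clean_outliers(vals, 5.0)
print(cleaned)  # 12.0 -> nan: |12.0 - med| / MAD is far above the threshold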
Example #3
    def adaptiveMedianFilt(dat, kernel_size):
        """
        Perform a median filter with a sliding window. At the edges, the window shrinks.
        """
        assert kernel_size % 2 == 1, 'kernel_size must be odd'

        nobs = dat.size
        filt_data = np.nan * np.ones_like(dat)

        # Beginning region
        halfWindow = 0
        for i in range(kernel_size // 2):
            filt_data[i] = nanmedian(dat[i - halfWindow:i + halfWindow + 1])
            halfWindow += 1

        # Middle region
        halfWindow = kernel_size // 2
        for i in range(halfWindow, nobs - halfWindow):
            filt_data[i] = nanmedian(dat[i - halfWindow:i + halfWindow + 1])

        # Ending region: shrink the window symmetrically toward the last point
        halfWindow -= 1
        for i in range(nobs - halfWindow - 1, nobs):
            filt_data[i] = nanmedian(dat[i - halfWindow:i + halfWindow + 1])
            halfWindow -= 1

        return filt_data
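
A quick check of the shrinking-window behavior, assuming nanmedian is numpy's and treating adaptiveMedianFilt as a free function:

import numpy as np
from numpy import nanmedian

x = np.array([5., 1., 2., 9., 3.])
print(adaptiveMedianFilt(x, 3))  # [5. 2. 2. 3. 3.] - the endpoints use a 1-point window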
Example #4
def calc_norm_summary_tables(accuracy_tbl, time_tbl):
    """
    Calculate normalized performance/ranking summary, as numpy
    matrices as usual for convenience, and matrices of additional
    statistics (min, max, percentiles, etc.)

    Here normalized means relative to the best which gets a 1, all
    others get the ratio resulting from dividing by the performance of
    the best.
    """
    # Min across all minimizers, i.e. for each fit problem what is the lowest chi-squared and the lowest time
    min_sum_err_sq = np.nanmin(accuracy_tbl, 1)
    min_runtime = np.nanmin(time_tbl, 1)

    # create normalised tables
    norm_acc_rankings = accuracy_tbl / min_sum_err_sq[:, None]
    norm_runtimes = time_tbl / min_runtime[:, None]

    summary_cells_acc = np.array([np.nanmin(norm_acc_rankings, 0),
                                  np.nanmax(norm_acc_rankings, 0),
                                  stats.nanmean(norm_acc_rankings, 0),
                                  stats.nanmedian(norm_acc_rankings, 0)
                                  ])

    summary_cells_runtime = np.array([np.nanmin(norm_runtimes, 0),
                                      np.nanmax(norm_runtimes, 0),
                                      stats.nanmean(norm_runtimes, 0),
                                      stats.nanmedian(norm_runtimes, 0)
                                      ])

    return norm_acc_rankings, norm_runtimes, summary_cells_acc, summary_cells_runtime
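
A toy run, assuming stats is the pre-1.0 scipy.stats (which still shipped nanmean/nanmedian). Rows are fit problems, columns are minimizers; the best entry in each row normalizes to 1.0:

import numpy as np

accuracy_tbl = np.array([[1.0, 2.0],
                         [4.0, 2.0]])
time_tbl = np.array([[0.5, 1.0],
                     [2.0, 1.0]])
norm_acc, norm_time, cells_acc, cells_time = calc_norm_summary_tables(accuracy_tbl, time_tbl)
print(norm_acc)  # [[1. 2.]
                 #  [2. 1.]]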
Example #5
def plotForce():
    figure(size=3,aspect=0.5)
    subplot(1,2,1)
    from EvalTraj import plotFF
    plotFF(vp=351,t=28,f=900,cm=0.6,foffset=8)
    subplot_annotate()
    
    subplot(1,2,2)
    for i in [1,2,3,4]:
        R=np.squeeze(np.load('Rdpse%d.npy'%i))
        R=stats.nanmedian(R,axis=2)[:,1:,:]
        dps=np.linspace(-1,1,201)[1:]
        plt.plot(dps,R[:,:,2].mean(0));
    plt.legend([0,0.1,0.2,0.3],loc=3) 
    i=2
    R=np.squeeze(np.load('Rdpse%d.npy'%i))
    R=stats.nanmedian(R,axis=2)[:,1:,:]
    mn=np.argmin(R,axis=1)
    y=np.random.randn(mn.shape[0])*0.00002+0.0438
    plt.plot(np.sort(dps[mn[:,2]]),y,'+',mew=1,ms=6,mec=[ 0.39  ,  0.76,  0.64])
    plt.xlabel('Displacement of Force Origin')
    plt.ylabel('Average Net Force Magnitude')
    hh=dps[mn[:,2]]
    err=np.std(hh)/np.sqrt(hh.shape[0])*stats.t.ppf(0.975,hh.shape[0])
    err2=np.std(hh)/np.sqrt(hh.shape[0])*stats.t.ppf(0.75,hh.shape[0])
    m=np.mean(hh)
    print(m, m - err, m + err)
    np.save('force',[m, m-err,m+err,m-err2,m+err2])
    plt.xlim([-0.5,0.5])
    plt.ylim([0.0435,0.046])
    plt.grid(b=True,axis='x')
    subplot_annotate()
Example #6
 def normalization_shift_rows_or_cols(self, axis, stacker, history):
     endshape = [-1, -1]
     endshape[axis] = 1
     if self.combine_replicates:
         all_plates = stacker(list(self.normalization_plate_values.values()))
         controls = (stacker([self.normalization_control_maps[pl] for pl, _ in self.normalization_plate_values.keys()]) != CONTROL_POPULATION)
         all_plates[controls] = np.nan
         # use conservative_nanmedian to avoid taking median of too few values
         offsets = fix_nans(conservative_nanmedian(all_plates, axis))
         if offsets is None:  # too many NaNs to use conservative_nanmedian, try again.
             offsets = fix_nans(nanmedian(all_plates, axis))
             assert offsets is not None, "Too many bad values to correct row/column"
         offsets = offsets.reshape(endshape)
         # shift offsets to zero-median to keep things identifiable
         offsets -= np.median(offsets)
         history += offsets
         return dict(((plate, repindex), values - offsets)
                     for ((plate, repindex), values) in self.normalization_plate_values.items())
     else:
         offsets = {}
         for repindex in range(self.num_replicates):
             rep_plates = stacker([v for (_, rep), v in self.normalization_plate_values.items() if repindex == rep])
             controls = (stacker([self.normalization_control_maps[pl] for pl, rep in self.normalization_plate_values.keys() if repindex == rep]) != CONTROL_POPULATION)
             rep_plates[controls] = np.nan
             # use conservative_nanmedian to avoid taking median of too few values
             offsets[repindex] = fix_nans(conservative_nanmedian(rep_plates, axis))
             if offsets[repindex] is None:  # too many NaNs to use conservative_nanmedian, try again.
                 offsets[repindex] = fix_nans(nanmedian(rep_plates, axis))
                 assert offsets[repindex] is not None, "Too many bad values to correct row/column"
             offsets[repindex] = offsets[repindex].reshape(endshape)
             # shift offsets to zero-median to keep things identifiable
             offsets[repindex] -= np.median(offsets[repindex])
             history[repindex] += offsets[repindex]
         return dict(((plate, repindex), values - offsets[repindex])
                     for ((plate, repindex), values) in self.normalization_plate_values.items())
Example #7
def robust_median(v, m=2.):
    print("Robust method: median distance test")
    median = stats.nanmedian(v)
    dist_median = numpy.absolute(v - median)      # absolute distance of each value from the median
    ratio_dist_median = dist_median / median      # ratio of distance to median; the larger the ratio,
                                                  # the more likely the value is an outlier
    v = numpy.where(ratio_dist_median >= m, numpy.nan, v) if median else 0    # set all outliers to NaN

    return v,ratio_dist_median
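
A usage sketch, assuming numpy and a pre-1.0 scipy.stats: values whose distance-to-median ratio reaches m become NaN.

import numpy

v = numpy.array([10., 11., 9., 40.])
cleaned, ratios = robust_median(v, m=2.)
print(cleaned)  # [10. 11.  9. nan]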
Example #8
def return_speedup_stats(x, y):

    speedup_stats = {
        'ratio_of_the_means': stats.nanmean(x) / stats.nanmean(y),
        'ratio_of_the_medians': stats.nanmedian(x) / stats.nanmedian(y),
        'ratio_of_the_stddevs': stats.nanstd(x) / stats.nanstd(y),
        'ratio_max_to_min': np.amax(x) / np.amin(y),
        'ratio_min_to_max': np.amin(x) / np.amax(y)
    }
    return speedup_stats
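
A usage sketch (stats again being the pre-1.0 scipy.stats):

import numpy as np

x = np.array([2.0, 4.0, 6.0])  # e.g. baseline runtimes
y = np.array([1.0, 2.0, 3.0])  # e.g. optimized runtimes
print(return_speedup_stats(x, y)['ratio_of_the_medians'])  # 2.0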
Example #9
def removeoutliers(inarray, stdcut=3.0):
    # crude outlier cut: stdcut is the number of sigmas; outliers are replaced
    # with the value of the nearest good neighbor
    # first mark the bad numbers
    inarray[np.logical_not(np.isfinite(inarray))] = 0.
    indexarray = np.arange(len(inarray))
    badi = indexarray[np.abs(inarray - nanmedian(inarray)) > stdcut * nanstd(inarray)]
    goodi = indexarray[np.abs(inarray - nanmedian(inarray)) <= stdcut * nanstd(inarray)]
    outarray = inarray.copy()
    for i in badi:
        outarray[i] = inarray[goodi[np.abs(goodi - i).argmin()]]
    return outarray
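
With numpy's nanmedian/nanstd imported under those bare names, a point beyond stdcut sigmas is swapped for the value of its nearest surviving neighbor:

import numpy as np
from numpy import nanmedian, nanstd

arr = np.array([1., 1., 1., 1., 1., 1., 1., 1., 1., 100.])
print(removeoutliers(arr))  # the 100. becomes 1.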
Example #11
    def normalization_align_plates(self):
        offsets = {}
        for (plate, repindex), values in self.normalization_plate_values.items():
            control_map = self.normalization_control_maps[plate]
            if self.alignment_method == ALIGN_POPULATION:
                align_values = values[control_map == CONTROL_POPULATION]
            elif self.alignment_method == ALIGN_EVERYTHING:
                align_values = values
            else:
                assert False, "Unknown normalization method: %s" % (
                    self.alignment_method)

            # XXX - should not shift plates that are more than half filled by controls
            # compute an offset per-plate and per-replicate
            if len(align_values) > 0:
                offsets[plate, repindex] = np.median(align_values)
            else:
                offsets[plate, repindex] = np.nan
        # shift offsets to zero-median to keep things identifiable
        if self.combine_replicates:
            # keep overall shift at 0
            global_shift = nanmedian(list(offsets.values()))
            for plate, repindex in offsets:
                if np.isnan(offsets[plate, repindex]):
                    offsets[plate, repindex] = 0.0
                else:
                    offsets[plate, repindex] -= global_shift
                self.normalization_total_plate_shifts[
                    plate, repindex] += offsets[plate, repindex]
            return dict(
                ((plate, repindex), values - offsets[plate, repindex])
                for ((plate, repindex),
                     values) in self.normalization_plate_values.items())
        else:
            replicate_indices = np.array([repindex for _, repindex in offsets])
            offset_vals = np.array(list(offsets.values()))
            per_replicate_shifts = dict(
                (repindex,
                 nanmedian(offset_vals[replicate_indices == repindex]))
                for repindex in range(self.num_replicates))
            for plate, repindex in offsets:
                if np.isnan(offsets[plate, repindex]):
                    offsets[plate, repindex] = 0.0
                else:
                    offsets[plate, repindex] -= per_replicate_shifts[repindex]
                self.normalization_total_plate_shifts[
                    plate, repindex] += offsets[plate, repindex]
            return dict(
                ((plate, repindex), values - offsets[plate, repindex])
                for ((plate, repindex),
                     values) in self.normalization_plate_values.items())
Example #12
    def make_plots(self, num_bins=50):
        import matplotlib.pyplot as p

        ## Histogram of Widths
        widths = [float(x) for x in self.dataframe["Widths"] if is_float_try(x)]
        widths_stats = [nanmean(widths), nanstd(widths), nanmedian(widths)]

        ## Histogram of Lengths
        lengths = self.dataframe["Lengths"]
        lengths_stats = [nanmean(lengths), nanstd(lengths), nanmedian(lengths)]

        ## Histogram of Curvature
        rht_curvature = self.dataframe["RHT Curvature"]
        rht_curvature_stats = [nanmean(rht_curvature), nanstd(rht_curvature), nanmedian(rht_curvature)]

        if self.verbose:
            print("Widths Stats: %s" % (widths_stats))
            print("Lengths Stats: %s" % (lengths_stats))
            print("Curvature Stats: %s" % (rht_curvature_stats))

            p.subplot(131)
            p.hist(widths, num_bins)
            p.xlabel("Widths (pc)")
            p.subplot(132)
            p.hist(lengths, num_bins)
            p.xlabel("Lengths (pc)")
            p.subplot(133)
            p.hist(rht_curvature, num_bins)
            p.xlabel("RHT Curvature")
            p.show()
        if self.save:
            p.hist(widths, num_bins)
            p.xlabel("Widths (pc)")
            p.savefig("".join([self.save_name,"_widths.pdf"]))
            p.close()

            p.hist(lengths, num_bins)
            p.xlabel("Lengths (pc)")
            p.savefig("".join([self.save_name,"_lengths.pdf"]))
            p.close()

            p.hist(rht_curvature, num_bins)
            p.xlabel("RHT Curvature")
            p.savefig("".join([self.save_name,"_rht_curvature.pdf"]))
            p.close()

        return self
Example #13
 def median_f(self, x):
     """Compute median over time varying axis of a front relative
     quantity, x.
     """
     # TODO: the axis used in nanmean is different for U and Uf
     # calcs - change Uf dims to make consistent?
     return stats.nanmedian(x, axis=1)
Example #14
def calc_stats(a, maskzero=False):
    """Calculate the statistics of an array"""
    statsDict = {}
    a = np.array(a)
    if maskzero:
        a = np.where(np.equal(a, 0.0), np.nan, a)

    # Check that array is not all NaNs
    statsDict['npix'] = int(np.sum(np.where(np.isnan(a), 0.0, 1.0)))
    if statsDict['npix'] >= 2:
        statsDict['stdev'] = float(stats.nanstd(a.flatten()))
        statsDict['mean'] = float(stats.nanmean(a.flatten()))
        statsDict['median'] = float(stats.nanmedian(a.flatten()))
        statsDict['max'] = float(np.nanmax(a))
        statsDict['min'] = float(np.nanmin(a))
        statsDict['centmax'] = list(np.unravel_index(np.nanargmax(a), a.shape))
        statsDict['madfm'] = float(MAD(a.flatten()))
        statsDict['npix'] = int(np.sum(np.where(np.isnan(a), 0.0, 1.0)))
        statsDict['success'] = True

    else:
        statsDict['npix'] = 0
        statsDict['stdev'] = 0.0
        statsDict['mean'] = 0.0
        statsDict['median'] = 0.0
        statsDict['max'] = 0.0
        statsDict['min'] = 0.0
        statsDict['centmax'] = (0.0, 0.0)
        statsDict['madfm'] = 0.0
        statsDict['success'] = False

    return statsDict
Example #16
def ComputeDensityChange(hpImage,lpImage,ws,ov):

    SizeZVol = hpImage.shape[0]
    SizeXVol = hpImage.shape[1]
    SizeYVol = hpImage.shape[2]

    hpImage = (254.0)/(1.0+np.exp(-(hpImage-200.0)/-100.0))+2.0
    lpImage = (254.0)/(1.0+np.exp(-(lpImage-200.0)/-100.0))+2.0

    BlocSpeed =  int((1-ov) * ws)
    if BlocSpeed == 0:
        print("The overlap ratio is too big; the block matching step is set to 1 pixel")
        BlocSpeed = 1

    densityMap = np.zeros((SizeZVol,SizeXVol,SizeYVol))
    iterationMap = np.zeros((SizeZVol,SizeXVol,SizeYVol))

    # ready for loop

    for xRef in np.arange(0,SizeXVol -1 ,BlocSpeed):
        for yRef in np.arange(0,SizeYVol -1,BlocSpeed):
            for zRef in np.arange(0,SizeZVol -1,BlocSpeed):
                print(xRef, yRef, zRef)

                xMinRef = xRef-int(ws/2)
                yMinRef = yRef-int(ws/2)
                zMinRef = zRef-int(ws/2)

                xMaxRef = xRef+int(ws/2)
                yMaxRef = yRef+int(ws/2)
                zMaxRef = zRef+int(ws/2)

                if xMinRef < 0 :
                    xMinRef = 0
                if yMinRef < 0 :
                    yMinRef = 0
                if zMinRef < 0 :
                    zMinRef = 0

                if xMaxRef >= SizeXVol :
                    xMaxRef = SizeXVol - 1
                if yMaxRef >= SizeYVol :
                    yMaxRef = SizeYVol - 1
                if zMaxRef >= SizeZVol :
                    zMaxRef = SizeZVol - 1

                BlocRefHP = hpImage[zMinRef:zMaxRef,xMinRef:xMaxRef,yMinRef:yMaxRef]+1.0
                BlocRefLP = lpImage[zMinRef:zMaxRef,xMinRef:xMaxRef,yMinRef:yMaxRef]+1.0
                BlocRef = (BlocRefLP - BlocRefHP)

                densityMap[zMinRef:zMaxRef,xMinRef:xMaxRef,yMinRef:yMaxRef] +=  stats.nanmedian(BlocRef,axis=None)
                iterationMap[zMinRef:zMaxRef,xMinRef:xMaxRef,yMinRef:yMaxRef] += 1

    with np.errstate(divide='ignore', invalid='ignore'):
        c = np.true_divide(densityMap,iterationMap)
        c[c == np.inf] = 0
        c = np.nan_to_num(c)


    return c
Example #17
    def get_munged_clean_data(self):
        train_df = self.datamart['train']

        Y_train_df = train_df['Target']
        Y_train = np.array(Y_train_df)
        del train_df['Target']

        test_df = self.datamart['test']

        assert np.all(train_df.columns == test_df.columns)
        X_train = np.array(train_df)
        del train_df
        X_test = np.array(test_df)
        del test_df
        X_train_nan = np.isnan(X_train)
        X_test_nan = np.isnan(X_test)
        X_train = np.hstack((X_train,X_train_nan))
        X_test = np.hstack((X_test,X_test_nan))
        X_train_median = stats.nanmedian(X_train,axis=0)
        for i in range(X_train.shape[1]):
            X_train[np.isnan(X_train[:,i]),i] = X_train_median[i]
        for i in range(X_test.shape[1]):
            X_test[np.isnan(X_test[:,i]),i] = X_train_median[i]
        keep_not_boring = X_train.std(axis=0) > 0.0
        X_train = X_train[:,keep_not_boring]
        X_test = X_test[:,keep_not_boring]
        return X_train, Y_train, X_test
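
The imputation core of the method above, as a standalone sketch: the NaN mask is appended as indicator features, then column medians fill the gaps (np.nanmedian standing in for the old stats.nanmedian):

import numpy as np

X = np.array([[1., np.nan], [3., 4.], [np.nan, 8.]])
X = np.hstack((X, np.isnan(X)))  # indicator columns for missingness
med = np.nanmedian(X, axis=0)
for i in range(X.shape[1]):
    X[np.isnan(X[:, i]), i] = med[i]
print(X)  # NaNs replaced by the column medians 2. and 6.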
Example #18
def calc_stats_old(a, maskzero=False):
    """Calculate the statistics of an array"""
    
    statsDict = {}
    a = np.array(a)
    if maskzero:
        a = np.where( np.equal(a, 0.0), np.nan, a)

    # Check that array is not all NaNs
    statsDict['npix'] = int(np.sum(np.where(np.isnan(a),0.0,1.0)))
    if statsDict['npix']>=2:
        statsDict['stdev'] = float(stats.nanstd(a.flatten()))
        statsDict['mean'] = float(stats.nanmean(a.flatten()))
        statsDict['median'] = float(stats.nanmedian(a.flatten()))
        statsDict['max'] = float(np.nanmax(a))
        statsDict['min'] = float(np.nanmin(a))
        statsDict['centmax'] = list(np.unravel_index(np.nanargmax(a),
                                                     a.shape))
        statsDict['madfm'] = float(MAD(a.flatten()))
        statsDict['npix'] = int(np.sum(np.where(np.isnan(a),0.0,1.0)))
        statsDict['success'] = True
        
    else:
        statsDict['npix'] = 0
        statsDict['stdev']   = 0.0
        statsDict['mean']    = 0.0
        statsDict['median']  = 0.0
        statsDict['max']     = 0.0
        statsDict['min']     = 0.0
        statsDict['centmax'] = (0.0, 0.0)
        statsDict['madfm']   = 0.0
        statsDict['success'] = False
        
    return statsDict
Example #19
def make_oiiewbins(zmin=_zmin, zmax=_zmax):
    """
    """
    nbin = 2 + 3 + 4 + 5
    oiiewmin = np.zeros(nbin)
    oiiewmax = np.zeros(nbin)
    oiiewbin = np.zeros(nbin)
    oiiewmin[0:2] = [_EPS, 50.0]
    oiiewmax[0:2] = [50.0, 200.]
    oiiewmin[2:2 + 3] = [_EPS, 40.0, 70.0]
    oiiewmax[2:2 + 3] = [40.0, 70.0, 200.]
    oiiewmin[5:5 + 4] = [_EPS, 30.0, 50.0, 80.0]
    oiiewmax[5:5 + 4] = [30.0, 50.0, 80.0, 200.]
    oiiewmin[9:9 + 5] = [_EPS, 25.0, 45.0, 60.0, 90.0]
    oiiewmax[9:9 + 5] = [25.0, 45.0, 60.0, 90.0, 200.]

    oiilummin = np.zeros(nbin)
    oiilummax = np.zeros(nbin)
    oiilumbin = np.zeros(nbin)
    oiilummin[0:2] = [40.0, 41.6]
    oiilummax[0:2] = [41.6, 43.5]
    oiilummin[2:2 + 3] = [40.0, 41.4, 41.8]
    oiilummax[2:2 + 3] = [41.4, 41.8, 43.5]
    oiilummin[5:5 + 4] = [40.0, 41.3, 41.6, 41.9]
    oiilummax[5:5 + 4] = [41.3, 41.6, 41.9, 43.5]
    oiilummin[9:9 + 5] = [40.0, 41.2, 41.5, 41.7, 42.0]
    oiilummax[9:9 + 5] = [41.2, 41.5, 41.7, 42.0, 43.5]

    # Calculate the medians
    objs_ori = elg_readin()
    vac_objs = elg_readin(vac=True)
    nobj = objs_ori.size
    zindex = (np.where(
        np.logical_and(
            np.logical_and(
                np.logical_and(objs_ori['zGOOD'] == 1, objs_ori['Z'] > zmin),
                objs_ori['Z'] < zmax), objs_ori['CLASS'] == 'GALAXY')))[0]
    oiiew = vac_objs['OIIEW'][zindex]
    logoiilum = np.log10(vac_objs['OIILUM'][zindex])

    for i in np.arange(nbin):
        oiiewbin[i] = nanmedian(oiiew[((oiiew > oiiewmin[i]) &
                                       (oiiew < oiiewmax[i]))])
        oiilumbin[i] = nanmedian(logoiilum[((logoiilum > oiilummin[i]) &
                                            (logoiilum < oiilummax[i]))])

    return (oiiewmin, oiiewmax, oiiewbin, oiilummin, oiilummax, oiilumbin)
Example #20
def calc_clipped_stats_old(data, clip=3.0, nIter=10):
    """Calculate the mean and stdev of an array given a sigma clip"""
    
    data = np.array(data).flatten()
    
    mean = float(stats.nanmean(data))
    median = float(stats.nanmedian(data))
    std = float(stats.nanstd(data))
    mad = float(MAD(data))
    npix = np.sum(np.where(np.isnan(data), 0.0, 1.0))

    if clip > 0.0:
        convergeFlg = 0
        itCnt = 0
        while convergeFlg==0 and itCnt<nIter:
            meanOld, stdOld, madOld = mean, std, mad
            minVal = mean - (clip * mad)
            maxVal = mean + (clip * mad)

            # Blank values outside the 3-sigma range
            dataMsk = np.where(np.greater(data, maxVal), np.nan, data)
            dataMsk = np.where(np.less(data, minVal), np.nan, dataMsk)

            # Measure the statistics
            mean = stats.nanmean(dataMsk)
            median = stats.nanmedian(dataMsk)
            std = stats.nanstd(dataMsk)
            mad = MAD(dataMsk)
            npix = np.sum(np.where(np.isnan(dataMsk),0.0,1.0))
            dataMsk = []
            
            if mean == meanOld and mad == madOld:
                convergeFlg = 1
            itCnt += 1
            

    # Assemble the measurements into a dictionary
    m = {}
    m['mean'] = float(mean)
    m['median'] = float(median)
    m['stdev'] = float(std)
    m['madfm'] = float(mad)
    m['npix'] =int(npix)
    m['max'] = float(np.nanmax(data))
    m['min'] = float(np.nanmin(data))
    del data
    
    # If all nans
    if m['npix'] == 0:
        m['stdev'] = 0.0
        m['mean'] = 0.0
        m['median'] = 0.0
        m['max'] = 0.0
        m['min'] = 0.0
        m['centmax'] = (0.0,0.0)
        m['madfm'] = 0.0
        m['success'] = False
    else:
        m['success'] = True

    return m
Example #21
    def summary(self):
        """
        return summary statistics for the dataset
        """
        r="RT (all): Min=%.2f, Max=%.2f, Mean=%.2f, Median=%.2f\n"%(np.nanmin(self.RT),
                                                                    np.nanmax(self.RT),
                                                                    stats.nanmean(self.RT),
                                                                    stats.nanmedian(self.RT))
        for cond in range(self.design.nconditions()):
            r+="RT ({cond}): Min={minrt}, Max={maxrt}, Mean={meanrt}, Median={medrt}\n".format(
                cond=":".join(self.design.condidx(cond)),
                minrt=np.nanmin(self.RT[self.condition==cond]),
                maxrt=np.nanmax(self.RT[self.condition==cond]),
                meanrt=stats.nanmean(self.RT[self.condition==cond]),
                medrt=stats.nanmedian(self.RT[self.condition==cond]))

        r+='errors (all GO): {nerr}/{ntrials} ({errperc:.2f} %)\n'.format(
            nerr=np.sum(np.logical_not(self.correct[np.isnan(self.SSD)])),
            ntrials=len(self.correct[np.isnan(self.SSD)]),
            errperc=100.*np.sum(np.logical_not(self.correct[np.isnan(self.SSD)]))/float(len(self.correct[np.isnan(self.SSD)])))
        for cond in range(self.design.nconditions()):
            r+='errors ({cond}): {nerr}/{ntrials} ({errperc:.2f} %)\n'.format(
                cond=":".join(self.design.condidx(cond)),
                nerr=np.sum(np.logical_not(self.correct[(self.condition==cond) & np.isnan(self.SSD)])),
                ntrials=len(self.correct[(self.condition==cond) & np.isnan(self.SSD)]),
                errperc=100.*np.sum(np.logical_not(self.correct[(self.condition==cond) & np.isnan(self.SSD)]))
                               /float(len(self.correct[(self.condition==cond) & np.isnan(self.SSD)])))
                
            
        r+='miss GO (all): {nmiss}/{ntrials} ({missperc:.2f} %)\n'.format(
            nmiss=np.sum(np.isnan(self.RT[np.isnan(self.SSD)])),
            ntrials=self.ntrials,
            missperc=100.*np.sum(np.isnan(self.RT[np.isnan(self.SSD)]))/float(self.ntrials)
            )
        for cond in range(self.design.nconditions()):
            r+="miss GO ({cond}): {nmiss}/{ntrials} ({missperc:.2f} %)\n".format(
                cond=":".join(self.design.condidx(cond)),
                ntrials=len(self.RT[self.condition==cond]),
                missperc=100.*np.sum(np.isnan(self.RT[(self.condition==cond) & np.isnan(self.SSD)]))/float(self.ntrials),
                nmiss=np.sum(np.isnan(self.RT[(self.condition==cond) & (np.isnan(self.SSD))])))

        r+="SSD-distribution\n"
        a=stats.itemfreq(self.SSD[np.isfinite(self.SSD)])#.astype(np.int)
        r+= " NUM | "+" ".join(["%7i"%int(i) for i in (a[:,1])]) + "\n"
        r+= " SSD | "+" ".join(["%7.2f"%(i) for i in (a[:,0])]) +"\n"            
        return r
Example #22
    def buildtable(self):
        """
        builds the table of stars
        """
        import numpy as np

        epochs = len(self.objids)
        stars = len(self.stars)
        if fileexists('/work2/jwe/NGC2281/' + self.filter + 'array.npy'):
            m = np.load('/work2/jwe/NGC2281/' + self.filter + 'array.npy')
        else:
            from datasource import DataSource
            from framecal import FrameCal

            fc = FrameCal(self.filter)

            m = np.zeros([epochs, stars])
            # objid is specific to a filter so we only need to query the objid
            wifsip = DataSource(host='pina', database='wifsip', user='******')
            for objid in self.objids:
                k = self.objids.index(objid)
                print(k, epochs, objid, end=' ')
                query = """SELECT matched.id, phot.mag_auto, phot.mag_errauto 
                        FROM phot, matched
                        WHERE phot.objid like '%s'
                        AND (matched.objid,matched.star) = (phot.objid,phot.star)
                        AND phot.flags = 0;""" % objid
                result = wifsip.query(query)
                starids = [s[0] for s in result]
                mags = [s[1] for s in result]
                err = [s[2] for s in result]

                slope, intercept, _, _, _ = fc.calframe(objid)
                print(len(mags))
                for starid in starids:
                    i = self.stars.index(starid)
                    m[k, i] = mags[starids.index(starid)] * slope + intercept
            np.save('/work2/jwe/NGC2281/' + self.filter + 'array.npy', m)
            wifsip.close()

        i = np.where(m == 0.0)
        m[i] = np.nan
        from scipy import stats
        # calculate the observed average for the stars
        avg = stats.nanmean(m, axis=0)
        for k in range(epochs):
            print(k, epochs, self.objids[k])

            # calculate the mean of offsets
            off = stats.nanmedian(m[k, :] - avg)
            # correct epoch for mean of offsets
            m[k, :] += off

        # calculate new corrected means
        avg = stats.nanmean(m, axis=0)
        std = stats.nanstd(m, axis=0)
        for i in range(len(self.stars)):
            print(self.stars[i], avg[i], std[i])
Example #24
def mgii_composite_jackknife():
    (master_wave, rest_allflux, rest_allivar) = rest_allspec_readin()
    master_loglam = np.log10(master_wave)
    # mask out useless wavelength ranges
    # left 2300
    wave_pos = np.array([2200.])
    rest_loc = np.searchsorted(master_wave, wave_pos)
    rest_allivar[0:rest_loc[0],:] = 0.
    # Fe II 2350
    wave_pos = np.array([2330., 2420])
    rest_loc = np.searchsorted(master_wave, wave_pos)
    rest_allivar[rest_loc[0]:rest_loc[1],:] = 0.
    # Fe II 2600
    wave_pos = np.array([2570., 2640])
    rest_loc = np.searchsorted(master_wave, wave_pos)
    rest_allivar[rest_loc[0]:rest_loc[1],:] = 0.
    # Mg II 2800
    wave_pos = np.array([2770., 2820])
    rest_loc = np.searchsorted(master_wave, wave_pos)
    rest_allivar[rest_loc[0]:rest_loc[1],:] = 0.
    # Mg I 2853
    wave_pos = np.array([2843., 2863])
    rest_loc = np.searchsorted(master_wave, wave_pos)
    rest_allivar[rest_loc[0]:rest_loc[1],:] = 0.
    # right 2900
    wave_pos = np.array([2900.])
    rest_loc = np.searchsorted(master_wave, wave_pos)
    rest_allivar[rest_loc[0]:,:] = 0.

    normalized_rest_allflux = rest_allflux
    for i in np.arange((rest_allflux.shape)[1]):
        imask = (np.where(rest_allivar[:,i]>0.))[0]
        if imask.size>0: 
           x = np.log10(master_wave[imask])
           y = rest_allflux[imask, i]
           z = np.polyfit(x, y, 3)
           p = np.poly1d(z)
           continuum = p(master_loglam)
           normalized_rest_allflux[:,i] = rest_allflux[:,i]/continuum
    
    wave_pos = np.array([2200., 2900.])
    rest_loc = np.searchsorted(master_wave, wave_pos)

    outwave = master_wave[rest_loc[0]:rest_loc[1]]
    #tmp_fluxmean = nanmean(normalized_rest_allflux[rest_loc[0]:rest_loc[1], _mgii_index], 1)
    #tmp_fluxmedian = nanmedian(normalized_rest_allflux[rest_loc[0]:rest_loc[1], _mgii_index], 1)
    fluxused = normalized_rest_allflux[rest_loc[0]:rest_loc[1], _mgii_index]
    njack = len(_mgii_index)
    nwave = outwave.size
    fluxmean = np.zeros((nwave, njack))
    fluxmedian = np.zeros((nwave, njack))
    for ijack in np.arange(njack):
        fluxmean[:,ijack] = nanmean(normalized_rest_allflux[rest_loc[0]:rest_loc[1], np.r_[_mgii_index[:ijack], _mgii_index[ijack+1:]]], 1)
        fluxmedian[:,ijack] = nanmedian(normalized_rest_allflux[rest_loc[0]:rest_loc[1], np.r_[_mgii_index[:ijack], _mgii_index[ijack+1:]]], 1)
    #tmp_fluxmedian = nanmedian(normalized_rest_allflux[rest_loc[0]:rest_loc[1], _mgii_index], 1)

    return (outwave, fluxmean, fluxmedian, fluxused)
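
The leave-one-out pattern used in the loop above, in miniature: np.r_ concatenates the index list with the ijack-th entry dropped.

import numpy as np

idx = np.array([0, 1, 2, 3])
for ijack in range(len(idx)):
    loo = np.r_[idx[:ijack], idx[ijack + 1:]]  # leave one out
    print(ijack, loo)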
Example #25
def fivenum(v):
    v = np.array(v)
    try:
        np.sum(v)
    except TypeError:
        print('Error: you must provide a list or array of only numbers')
        return
    q1 = scoreatpercentile(v[~np.isnan(v)],25)
    q3 = scoreatpercentile(v[~np.isnan(v)],75)
    md = nanmedian(v)
    return np.nanmin(v), q1, md, q3, np.nanmax(v),
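
A usage sketch; scoreatpercentile is scipy.stats.scoreatpercentile, and numpy's nanmedian serves for the bare nanmedian name:

import numpy as np
from numpy import nanmedian
from scipy.stats import scoreatpercentile

v = [1., 2., 3., 4., np.nan, 100.]
print(fivenum(v))  # (1.0, 2.0, 3.0, 4.0, 100.0)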
Example #26
def aggregate_ftr_matrix(ftr_matrix):
    sig = []
    for ftr in ftr_matrix:
        median = stats.nanmedian(ftr)
        mean = stats.nanmean(ftr)
        std = stats.nanstd(ftr)
        # Invalid double scalars warning appears here
        skew = stats.skew(ftr) if any(ftr) else 0.0
        kurtosis = stats.kurtosis(ftr)
        sig.extend([median, mean, std, skew, kurtosis])
    return sig
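
An equivalent sketch with numpy's nan-aware reducers (scipy.stats.nanmedian/nanmean/nanstd were removed in SciPy 1.0; skew and kurtosis remain in scipy.stats):

import numpy as np
from scipy import stats

def aggregate_ftr_matrix_np(ftr_matrix):
    sig = []
    for ftr in ftr_matrix:
        ftr = np.asarray(ftr, dtype=float)
        sig.extend([np.nanmedian(ftr), np.nanmean(ftr), np.nanstd(ftr),
                    stats.skew(ftr) if np.any(ftr) else 0.0,
                    stats.kurtosis(ftr)])
    return sig

print(aggregate_ftr_matrix_np([[1., 2., 3.], [2., 2., 8.]]))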
Example #28
def calc_summary_table(minimizers, group_results):
    """
    Calculates a summary from problem-individual results. At the moment the only summary
    statistic calculated is the median. The output is produced as numpy matrices.

    @param minimizers :: list of minimizers used (their names)

    @param group_results :: results from running fitting tests on different problems (list
    of lists, where the first level is the group, and the second level is the individual test).


    @returns two numpy matrices (where rows are the groups, and columns are the minimizers)
    with the summary statistic (median) from the problem-individual results.
    """

    num_groups = len(group_results)
    num_minimizers = len(minimizers)

    groups_norm_acc = np.zeros((num_groups, num_minimizers))
    groups_norm_runtime = np.zeros((num_groups, num_minimizers))
    for group_idx, results_per_test in enumerate(group_results):
        num_tests = len(results_per_test)
        accuracy_tbl = np.zeros((num_tests, num_minimizers))
        time_tbl = np.zeros((num_tests, num_minimizers))

        for test_idx in range(0, num_tests):
            for minimiz_idx in range(0, num_minimizers):
                accuracy_tbl[test_idx, minimiz_idx] = results_per_test[test_idx][minimiz_idx].sum_err_sq
                time_tbl[test_idx, minimiz_idx] = results_per_test[test_idx][minimiz_idx].runtime

        # Min across all alternative runs/minimizers
        min_sum_err_sq = np.nanmin(accuracy_tbl, 1)
        min_runtime = np.nanmin(time_tbl, 1)

        norm_acc_rankings = accuracy_tbl / min_sum_err_sq[:, None]
        norm_runtime_rankings = time_tbl / min_runtime[:, None]

        groups_norm_acc[group_idx, :] = stats.nanmedian(norm_acc_rankings, 0)
        groups_norm_runtime[group_idx, :] = stats.nanmedian(norm_runtime_rankings, 0)

    return groups_norm_acc, groups_norm_runtime
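
A toy run with hypothetical result objects (only the sum_err_sq and runtime attributes matter; stats is the pre-1.0 scipy.stats):

from collections import namedtuple
import numpy as np

R = namedtuple('R', 'sum_err_sq runtime')
group_results = [[[R(1.0, 0.5), R(2.0, 1.0)],   # group 0, test 0
                  [R(4.0, 2.0), R(2.0, 1.0)]]]  # group 0, test 1
acc, runtime = calc_summary_table(['m1', 'm2'], group_results)
print(acc)  # [[1.5 1.5]]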
Example #29
def fivenum(v):
    """Returns Tukey's five number summary (minimum, lower-hinge, median, upper-hinge, maximum) for the input vector, a list or array of numbers based on 1.5 times the interquartile distance"""
    try:
        numpy.sum(v)
    except TypeError:
        print('Error: you must provide a list or array of only numbers')
    q1 = scoreatpercentile(v,25)
    q3 = scoreatpercentile(v,75)
    iqd = q3-q1
    md = nanmedian(v)
    whisker = 1.5*iqd
    return numpy.nanmin(v), md-whisker, md, md+whisker, numpy.nanmax(v),
Example #30
def read_data(filename):
    data = pd.read_table(filename, sep=',', warn_bad_lines=True, error_bad_lines=True)
    data = np.asarray(data.values, dtype = float)
    col_median = stats.nanmedian(data, axis=0)
    inds = np.where(np.isnan(data))
    data[inds] = np.take(col_median, inds[1])
    #data=[np.concatenate((np.array([data[:,1]]).T,data[:,6:]),axis=1)]
    X_train = data[:,6: ]
    Y_train = data[:,1:6]
    svm_x = [{i+1: xr[i] for i in range(xr.shape[0]) if not np.isnan(xr[i])} for xr in X_train]
    svm_y_ary = [[y for y in Y_train[:,i]] for i in range(Y_train.shape[1])]
    return svm_x, svm_y_ary
Example #31
 def get_no_nan_median(self):
     tmp = 1.0*self.C  # float copy so NaN can be assigned
     tmp[tmp == 127] = np.nan
     cur_med = stats.nanmedian(tmp,axis=1)
     
     if np.sum(np.isnan(cur_med)) > 0:
         cur_med_is_not_nan_idx = np.logical_not(np.isnan(cur_med))
         self.prev_med[cur_med_is_not_nan_idx] = cur_med[cur_med_is_not_nan_idx]
     else:
         self.prev_med = cur_med
     
     return 1*self.prev_med
Example #32
def binMean(X,Y,numBins=8,xmin=None,xmax=None):
    if xmin is None:
        xmin = X.min()
    if xmax is None:
        xmax = X.max()
    bins = np.linspace(xmin,xmax,numBins+1)
#    print bins,Y

    YY = np.array([nanmean(Y[(X > bins[binInd]) & (X <= bins[binInd+1])]) for binInd in range(numBins)])
    YYmedian = np.array([nanmedian(Y[(X > bins[binInd]) & (X <= bins[binInd+1])]) for binInd in range(numBins)])
    YYstd = np.array([np.std(Y[(X > bins[binInd]) & (X <= bins[binInd+1])]) for binInd in range(numBins)])
    return bins[:-1]+(bins[1]-bins[0])*0.5,YY,YYmedian,YYstd
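
With numpy's nanmean/nanmedian imported under those bare names, binMean bins Y by X and returns bin centers plus the per-bin mean, median, and std:

import numpy as np
from numpy import nanmean, nanmedian

X = np.array([0.1, 0.2, 0.6, 0.7, 0.9])
Y = np.array([1.0, 2.0, 3.0, 5.0, 4.0])
centers, means, medians, stds = binMean(X, Y, numBins=2)
print(centers, medians)  # [0.3 0.7] [2. 4.]  (X == xmin falls outside the first bin)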
Example #33
    def median(self,
               files=[],
               bands=[1],
               doReproject=True,
               maskName='mask',
               **kwargs):
        '''Calculate median of input bands

        Memory and CPU greedy method. Generates 3D cube from bands of
        all input images and calculates median. Adds median bands to self

        Parameters
        -----------
        files : list
            list of input files
        bands : list
            list of names/band_numbers to be processed
        doReproject : boolean, [True]
            reproject input files?
        maskName : str, ['mask']
            name of the mask in input files
        nClass : child of Nansat, [Nansat]
            This class is used to read input files
        eResampleAlg : int, [0]
            algorithm for reprojection, see Nansat.reproject()
        period : [datetime0, datetime1]
            Start and stop datetime objects from python datetime.

        '''
        # check inputs
        if len(files) == 0:
            self.logger.error('No input files given!')
            return

        # modify default values
        self.bandIDs = bands
        self.doReproject = doReproject
        self.maskName = maskName
        self._set_defaults(kwargs)

        lastN = self._get_layer_image(files[-1])
        # add medians of all bands
        for band in bands:
            bandCube, mask = self._get_cube(files, band)
            bandMedian = st.nanmedian(bandCube, axis=0)

            # get metadata of this band from the last image
            parameters = lastN.get_metadata(bandID=band)
            # add band and std with metadata
            self.add_band(array=bandMedian, parameters=parameters)

        self.add_band(array=mask, parameters={'name': 'mask'})
Example #34
def na_median(X):
    ''' returns a copy of X with NAs
    replaced by the median of the non NAs
    for each column
    '''
    col_median = nanmedian(X,axis=0)
    a=np.copy(X)
    inds = np.where(np.isnan(a))
    import sys
    sys.stderr.write(str(inds)+'\n')
    if inds[0].shape[0]>0:
        a[inds] = np.take(col_median, inds[1])
    return a
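
A usage sketch, with numpy's nanmedian bound to the bare name the function expects:

import numpy as np
from numpy import nanmedian

X = np.array([[1., np.nan],
              [3., 4.],
              [np.nan, 8.]])
print(na_median(X))  # NaNs become the column medians 2. and 6.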
Example #35
def main():
    dat = pd.read_table('data/train_v2.csv', sep=',')
    print("reading done, train")
    loss = np.asarray(dat.loss)
    dat = dat.drop(['loss', 'id'], 1)
    dat['new1'] = dat['f528'] - dat['f527']  # golden feature 1
    dat['new2'] = dat['f528'] - dat['f274']  # golden feature 2
    dat = np.asarray(dat.values, dtype=float)
    col_med = stats.nanmedian(dat, axis=0)
    print("calculated medians, train")
    inds = np.where(np.isnan(dat))
    dat[inds] = np.take(col_med, inds[1])
    print("median imputation done, train")
    scaler = preprocessing.Scaler().fit(dat)
    dat = scaler.transform(dat)
    print("scaling done, train")
    labels = (loss > 0).astype(int)
    np.save('data/x_train.npy', dat)
    np.save('data/y_train.npy', labels)
    np.save('data/loss.npy', loss)
    print("trainset done")

    dat = pd.read_table('data/test_v2.csv', sep=',')
    print("reading done, test")
    ids = np.asarray(dat.id)
    dat = dat.drop(['id'], 1)
    dat['new1'] = dat['f528'] - dat['f527']  # golden feature 1
    dat['new2'] = dat['f528'] - dat['f274']  # golden feature 2
    dat = np.asarray(dat.values, dtype=float)
    col_med = stats.nanmedian(dat, axis=0)
    print("calculated medians, test")
    inds = np.where(np.isnan(dat))
    dat[inds] = np.take(col_med, inds[1])
    print("imputation done, test")
    dat = scaler.transform(dat)
    print("scaling done, test")
    np.save('data/x_test.npy', dat)
    np.save('data/ids.npy', ids)
    print("testset done")
Example #37
 def getAnnulusCounts(self, im, annulusInner, annulusOuter, center):
     startpx = int(np.round(center[0]))
     startpy = int(np.round(center[1]))
     innerMask = aperture(startpx, startpy, annulusInner)
     outerMask = aperture(startpx, startpy, annulusOuter)
     annulusMask = outerMask-innerMask
     nanMask = np.isnan(im)
     annulusPixels =  np.array(np.where(np.logical_and(annulusMask==1, nanMask==False)))
     nAnnPix = annulusPixels.shape[1]
     annulusCounts = nanmedian(im[annulusPixels[0],annulusPixels[1]])*nAnnPix
     if self.verbose:
         print "Annulus Counts = ", annulusCounts
         print "Annulus pixels = ", nAnnPix
     return [annulusCounts, nAnnPix]
Example #38
def sdize_vector(vec, ignore_zeroes=True, use_median=True):  ## note: this operates in place! If you don't want that, pass vec.copy()
    v = vec
    if ignore_zeroes:
        v = vec[ vec != 0 ]
    if use_median:
        from scipy.stats import nanmedian
        mn = nanmedian(v)
        sd = mad(v)
    else:
        mn = np.nanmean( v )
        sd = np.nanstd( v )
    vec -= mn
    vec /= (sd + 0.001) ## try to minimize copies?
    return vec
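
A usage sketch taking the mean/std branch (the median branch needs SciPy < 1.0 for scipy.stats.nanmedian, plus a mad helper not shown on this page):

import numpy as np

vec = np.array([0., 2., 4., 100.])
print(sdize_vector(vec.copy(), use_median=False))  # (vec - mean of nonzeros) / (std + 0.001)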
Example #39
def plotAgdist():
    ''' plot average distance of the pursued agents'''
    dist,discard,the,rest=computeTrackInfo()
    del discard,the,rest
    plt.figure(0,figsize=(10,8))
    for vp in range(1,5):
        xlim=500
        ys=dist[vp-1]
        dat=np.zeros((len(ys),int(HZ*xlim/1000.0),2))*np.nan
        datrev=np.zeros((len(ys),int(HZ*500/1000.0),2))*np.nan
        #datN=np.zeros((len(ys),xlim/20))
        for i in range(len(ys)):
            ao=np.argsort(list(map(np.median,ys[i])))
            if len(ys[i])==0:continue
            N=ys[i][ao[0]].size
            if N==0:continue
            dat[i,:min(dat.shape[1],N),0]=ys[i][ao[0]][:min(dat.shape[1],N)]
            datrev[i,-min(datrev.shape[1],N):,0]=ys[i][ao[0]][-min(datrev.shape[1],N):]
            N=ys[i][ao[-1]].size
            dat[i,:min(dat.shape[1],N),1]=ys[i][ao[-1]][:min(dat.shape[1],N)]
            datrev[i,-min(datrev.shape[1],N):,1]=ys[i][ao[-1]][-min(datrev.shape[1],N):]
        nrags=np.array(list(map(len,ys)))
        ylims=[[[1,2.5]]*3,[[],[3,4],[3,5]]]
        for a in range(3)[::-1]:
            if a==2: sel=nrags>=(a+1)
            else: sel = nrags==(a+1)
            for i in range(2):
                if a==0 and i==1:continue
                plt.subplot(4,4,i*8+vp);plt.grid(b=False);#plt.ylim(ylims[i][a])
                plt.plot(np.linspace(0,xlim/1000.,dat.shape[1]),nanmedian(dat[sel,:,i],0));
                plt.subplot(4,4,i*8+vp+4);plt.grid(b=False);#plt.ylim(ylims[i][a])
                ss=datrev.shape[1]/HZ
                plt.plot(np.linspace(-ss,0,datrev.shape[1]),nanmedian(datrev[sel,:,i],0));
    plt.subplot(441)
    plt.legend(['> 2','2','1'],loc=4)
    initVP(1,1)
    plt.savefig(figpath+'trackAgdist')
Example #40
def medsubtract(image,outname):
    data = pyfits.open(image)[0].data.copy()
    if data.ndim==3:
        data = data[0].copy()

    tmp = data.copy()
    tmp[numpy.isnan(tmp)] = 0.
    tmp -= numpy.sort(tmp,0)[tmp.shape[0]//5]
    trace = tmp.sum(1)
    peak = trace.argmax()

    center = numpy.empty(data.shape[1])
    w = center.copy()
    for i in range(1+center.size//100):
       b = i*100
       e = b+100
       if e>center.size:
           e = center.size
       if b==e:
           continue
       center[b:e] = tmp[:,b:e].sum(1).argmax()
    bg = center.copy()
    x = numpy.arange(data.shape[0])
    for i in range(center.size):
        d = tmp[:,i].copy()
        peak = int(center[i])
        if numpy.isnan(d[peak]):
            center[i] = peak
            continue
        fit = numpy.array([0.,d[peak],peak,1.])
        cond = ~numpy.isnan(d)
        input = numpy.empty((d[cond].size,2))
        input[:,0] = x[cond].copy()
        input[:,1] = d[cond].copy()
        fit,chi = sf.ngaussfit(input,fit)
        center[i] = fit[2]
        w[i] = fit[3]

    fit = sf.lsqfit(ndimage.median_filter(center,17),'polynomial',5)
    centroid = sf.genfunc(numpy.arange(bg.size),0.,fit)
    w = numpy.median(w)
    for i in range(bg.size):
        d = data[:,i].copy()
        d[int(centroid[i]-w*4):int(centroid[i]+w*4)] = numpy.nan
        data[:,i] -= stats.nanmedian(d)

    hdu = pyfits.open(image)[0]
    hdu.data = data.copy()
    hdu.writeto(outname,clobber=True)
Example #41
    def medianres(self, res, wrap=2*np.pi):
        
        ncyc = self.cycs.shape[0]
        nwid = self.data.shape[2]
        nlen = self.data.shape[1]
        logger.info('Analyzing %d cycles for unwrapping errors'% (ncyc))
        numcheck = np.zeros(self.nslice, dtype=int)
        numcycper = np.zeros(self.nslice, dtype=int)
        
        progb = tsio.ProgressBar(maxValue=self.nslice)
        for kkk in range(self.nslice):
            cycind = np.flatnonzero(self.cycs[:, 0] == (kkk + 1))
            numcycper[kkk] = len(cycind)
            orig = self.data[kkk, :, :]
            resarr = np.zeros((numcycper[kkk], nlen, nwid), dtype=int)
            
            for img in range(numcycper[kkk]):
                ind = cycind[img]
                sgn1 = np.sign(self.cycs[ind, 1])
                ifg1 = np.abs(self.cycs[ind, 1]) - 1
            
                sgn2 = np.sign(self.cycs[ind, 2])
                ifg2 = np.abs(self.cycs[ind, 2]) - 1
            
                p11 = self.data[ifg1, :, :]
                p22 = self.data[ifg2, :, :]
            
                recons = sgn1 * p11 + sgn2 * p22
                derr = orig - recons
                #refph = st.nanmedian(derr.flatten())
                #derr = derr - refph
                
                resarr[img, :, :] = (np.round(derr / wrap)).astype(int)
                
            medres = st.nanmedian(resarr, axis=0)
            #idict = {}
            #idict['orig'] = orig
            #idict['medres'] = medres/wrap
            #plots.imagemany(idict,show=True)
            
            res.data[kkk, :, :] = medres
            numcheck[kkk] = np.nansum(np.abs(medres) > 0)
            
            progb.update(kkk, every=3)
            
        progb.close()

        self.cyccount = numcycper
        self.check = numcheck
Пример #43
0
    def median(self, files=[], bands=[1], doReproject=True, maskName='mask',
               **kwargs):
        '''Calculate median of input bands

        Memory and CPU greedy method. Generates 3D cube from bands of
        all input images and calculates median. Adds median bands to self

        Parameters
        -----------
        files : list
            list of input files
        bands : list
            list of names/band_numbers to be processed
        doReproject : boolean, [True]
            reproject input files?
        maskName : str, ['mask']
            name of the mask in input files
        nClass : child of Nansat, [Nansat]
            This class is used to read input files
        eResampleAlg : int, [0]
            algorithm for reprojection, see Nansat.reproject()
        period : [datetime0, datetime1]
            Start and stop datetime objects from python datetime.

        '''
        # check inputs
        if len(files) == 0:
            self.logger.error('No input files given!')
            return

        # modify default values
        self.bandIDs = bands
        self.doReproject = doReproject
        self.maskName = maskName
        self._set_defaults(kwargs)

        lastN = self._get_layer_image(files[-1])
        # add medians of all bands
        for band in bands:
            bandCube, mask = self._get_cube(files, band)
            bandMedian = st.nanmedian(bandCube, axis=0)

            # get metadata of this band from the last image
            parameters = lastN.get_metadata(bandID=band)
            # add band and std with metadata
            self.add_band(array=bandMedian, parameters=parameters)

        self.add_band(array=mask, parameters={'name': 'mask'})
Example #44
0
def sdize_vector(vec,
                 ignore_zeroes=True,
                 use_median=True
                 ):  ## note: this operates in place! If you don't want that, pass vec.copy()
    v = vec
    if ignore_zeroes:
        v = vec[vec != 0]
    if use_median:
        from scipy.stats import nanmedian
        mn = nanmedian(v)
        sd = mad(v)
    else:
        mn = np.nanmean(v)
        sd = np.nanstd(v)
    vec -= mn
    vec /= (sd + 0.001)  ## try to minimize copies?
    return vec
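Because sdize_vector standardizes its argument in place, callers that need the raw values must pass a copy. A minimal usage sketch (the sample array is made up; it assumes a mad helper like the one in Example #58 below):

import numpy as np

raw = np.array([0.0, 1.0, 2.0, 3.0, 100.0])
standardized = sdize_vector(raw.copy())  # raw keeps its original values
sdize_vector(raw)                        # raw itself is modified in place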
Example #45
0
    def median(self, files=[], bands=[1], doReproject=True, maskName='mask',
               opener=Nansat, eResampleAlg=0, period=(None, None),
               vmin=-np.inf, vmax=np.inf):
        '''Calculate median of input bands

        Memory and CPU greedy method. Generates 3D cube from bands of
        all input images and calculates median. Adds median bands to self

        Parameters
        -----------
        files : list
            list of input files
        bands : list
            list of names/band_numbers to be processed
        doReproject : boolean, [True]
            reproject input files?
        maskName : str, ['mask']
            name of the mask in input files
        opener : child of Nansat, [Nansat]
            This class is used to read input files
        eResampleAlg : int, [0]
            algorithm for reprojection, see Nansat.reproject()
        period : [datetime0, datetime1]
            Start and stop datetime objects from python datetime.
        vmin, vmax : float, [-np.inf, np.inf]
            lower and upper limits of valid data (passed to _get_cube)

        '''
        # check inputs
        if len(files) == 0:
            self.logger.error('No input files given!')
            return

        # add medians of all bands
        for band in bands:
            cube, mask, metadata = self._get_cube(files, band,
                                                    doReproject,
                                                    maskName,
                                                    opener,
                                                    eResampleAlg,
                                                    period, vmin, vmax)
            median = st.nanmedian(cube, axis=0)

            # add band and std with metadata
            self.add_band(array=median, parameters=metadata)

        self.add_band(array=mask, parameters={'name': 'mask'})
Example #46
0
    def _clean_nans(self, data):
        """
		Substitute NaNs with the median value of the related features

        Parameters
        ----------
        data : array, shape=[n_samples, n_features]
               Data array
        """
        r, c = np.isnan(data).nonzero()

        my = dict()
        for ic in np.unique(c):
            my[ic] = nanmedian(data[:, ic])

        for i in range(len(r)):
            data[r[i], c[i]] = my[c[i]]

        return data
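For reference, the per-column median imputation above can be checked on a toy array; this standalone sketch substitutes np.nanmedian for the snippet's nanmedian:

import numpy as np

data = np.array([[1.0, np.nan],
                 [3.0, 4.0],
                 [np.nan, 8.0]])

# same idea as _clean_nans: replace each NaN with its column's median
r, c = np.isnan(data).nonzero()
col_medians = {ic: np.nanmedian(data[:, ic]) for ic in np.unique(c)}
for i in range(len(r)):
    data[r[i], c[i]] = col_medians[c[i]]

print(data)  # NaNs become 2.0 (column 0) and 6.0 (column 1)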
Example #47
0
def binMean(X, Y, numBins=8, xmin=None, xmax=None):
    """Bin X into numBins equal-width bins; return bin centers and the mean, median and std of Y per bin."""
    if xmin is None:
        xmin = X.min()
    if xmax is None:
        xmax = X.max()
    bins = np.linspace(xmin, xmax, numBins + 1)
    #    print bins,Y

    YY = np.array([
        nanmean(Y[(X > bins[binInd]) & (X <= bins[binInd + 1])])
        for binInd in range(numBins)
    ])
    YYmedian = np.array([
        nanmedian(Y[(X > bins[binInd]) & (X <= bins[binInd + 1])])
        for binInd in range(numBins)
    ])
    YYstd = np.array([
        np.std(Y[(X > bins[binInd]) & (X <= bins[binInd + 1])])
        for binInd in range(numBins)
    ])
    return bins[:-1] + (bins[1] - bins[0]) * 0.5, YY, YYmedian, YYstd
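A quick usage sketch with synthetic data (nanmean/nanmedian were removed from modern SciPy; np.nanmean/np.nanmedian are drop-in replacements there):

import numpy as np

X = np.linspace(0.0, 1.0, 200)
Y = X ** 2 + 0.05 * np.random.randn(200)
centers, means, medians, stds = binMean(X, Y, numBins=4)
print(centers)  # midpoints of the four bins
print(means)    # mean of Y within each bin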
Example #48
0
def print_Description(merged_matrix):
    file_name = 'Descriptions.csv'

    with open(file_name, 'w') as f:
        f.write('Feature; nobs; min; max; mean; variance; skewness; kurtosis; median; Q1; Q2; Q3')
        f.write('\n')

    for row in range(0, len(merged_matrix)):
        print('Feature: ' + str((row + 1)))
        with open(file_name, 'a') as f:
            line = 'Feature ' + str(row + 1) + '; ' + \
                   str(stat.describe(merged_matrix[row])).replace('(', '').replace(')', '').replace(',', ';') + \
                   ';' + str(stat.nanmedian(merged_matrix[row]))

            quantile_arr = list(stat.mstats.mquantiles(merged_matrix[row]))

            print(str(quantile_arr))

            line = line + ';' + '; '.join([str(quantile) for quantile in quantile_arr])
            print(line)
            line = line.replace('.', ',')
            f.write(line)
            f.write('\n')
Example #49
0
def clean_spec_NaNs(flux):
    
    #fix initial nans on edges
    nanMap = np.isnan(flux)
    nanGroups, nNanGroups = label(nanMap)
#     leftEdgeIdx=0
#     rightEdgeIdx=len(flux)
    
#     plt.plot(nanMap)
#     plt.show()
    
#     nanMapIdx = np.where(nanMap==True) <<<<<make the next lines faster by using this
    if np.sum(nanMap) > 0:
        print('Found NaNs in flux array')
        
    for i, booI in enumerate(nanMap):
        if not booI:
            leftEdgeIdx = i
            break

    for j, rbooI in enumerate(nanMap[::-1]):
        if not rbooI:
            rightEdgeIdx = len(nanMap) - j
            break

    fluxMedian = stats.nanmedian(flux)
    if leftEdgeIdx > 0:
        flux[:leftEdgeIdx] = np.linspace(fluxMedian, flux[leftEdgeIdx + 1], leftEdgeIdx)
    if rightEdgeIdx < len(flux):
        flux[rightEdgeIdx:] = np.linspace(flux[rightEdgeIdx - 1], fluxMedian, len(flux) - rightEdgeIdx)

    nanMap = np.isnan(flux)
    if np.sum(nanMap) > 0:
        print('NaNs remain in flux array')

    plt.plot(nanMap)
    plt.show()
Example #50
0
    def append_clean_nans(self):
        train_nan = np.isnan(self.training_x)
        train_median = stats.nanmedian(self.training_x)
        train_nan_locs = np.where(train_nan)
        ms, ns = train_nan_locs
        for m, n in zip(ms, ns):
            self.training_x.iloc[m, n] = train_median[n]
        cols_to_keep = train_nan.sum(axis=0) != 0
        index_cols_to_keep = cols_to_keep[cols_to_keep].index
        self.train_dummy_nan = train_nan[index_cols_to_keep].astype(float)
        n_columns = []
        for i in self.train_dummy_nan.columns.tolist():
            n_columns.append("nan_" + i)
        self.train_dummy_nan.columns = n_columns
        #self.training_x += self.train_dummy_nan

        test_nan = np.isnan(self.testing_x)
        test_nan_locs = np.where(test_nan)
        ms, ns = test_nan_locs
        for m, n in zip(ms, ns):
            self.testing_x.iloc[m, n] = train_median[n]
        self.test_dummy_nan = test_nan[index_cols_to_keep].astype(float)
        self.test_dummy_nan.columns = n_columns
Example #51
0
    def fitSky(self):
#        if not self.ui.checkBoxdoSkySub.isChecked():
#            self.sky1d = np.zeros(self.data2D.shape[1])
#            return
        #self.useSB1 = True
        #self.useSB2 = True
        #print self.imagetype
    
        b0 = int(np.floor(self.SB1_x0)); b1 = int(np.ceil(self.SB1_x1))
        b2 = int(np.floor(self.SB2_x0)); b3 = int(np.ceil(self.SB2_x1))

        sky01 = np.copy(self.data2D[b0:b1, :])
        sky23 = np.copy(self.data2D[b2:b3, :])

        if not self.pars['useSB1']:
            sky01[:, :] = np.nan
        if not self.pars['useSB2']:
            sky23[:, :] = np.nan

        sky03 = np.append(sky01, sky23, axis=0)
        
#        print 'here'
#        print np.shape(sky03)
#        sky1d = np.median(sky03,0)
#        notNaN = np.where(np.isfinite(sky03))
#        sky1d = np.median(sky03[notNaN[0],notNaN[1]],0)
        sky1d = nanmedian(sky03,0)

        if np.all(np.isnan(sky1d)):  # both sky bands turned off: fall back to zeros
            sky1d[:] = 0.

        self.sky1d = sky1d 
#        print self.sky1d 

        """
Example #52
0
    def rebin(self, field, shape):
        """Rebin field to a coarser matrix via block-wise nested nanmedians"""
        sh = shape[0], field.shape[0] // shape[0], shape[1], field.shape[1] // shape[1]
        return nanmedian(nanmedian(field.reshape(sh), axis=-1), axis=1)
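The reshape packs each coarse cell's pixels into two extra axes, so two successive medians collapse the blocks (note this yields a median of medians, not a single median over each block). A standalone sketch using np.nanmedian:

import numpy as np

def rebin(field, shape):
    # view (R, C) as (shape[0], R//shape[0], shape[1], C//shape[1])
    sh = shape[0], field.shape[0] // shape[0], shape[1], field.shape[1] // shape[1]
    return np.nanmedian(np.nanmedian(field.reshape(sh), axis=-1), axis=1)

field = np.arange(24, dtype=float).reshape(4, 6)
field[0, 0] = np.nan            # NaNs are ignored within each block
print(rebin(field, (2, 3)))     # 2x3 coarse grid from 2x2 blocks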
Example #53
0
    def make_hists(self, num_bins=None, use_prettyplotlib=True):

        if use_prettyplotlib:
            try:
                import prettyplotlib as plt
            except ImportError:
                import matplotlib.pyplot as plt
                use_prettyplotlib = False
                print "prettyplotlib not installed. Using matplotlib..."
        else:
            import matplotlib.pyplot as plt

        # Setup subplots if plotting together
        if self.subplot:
            num = len(self.columns)
            if num <= 3:
                ncols = 1
                nrows = num
            elif num <= 8:
                ncols = 2
                nrows = num // 2
            else:  # Max columns right now is 12
                ncols = 3
                nrows = num // 3
            # Check if we need an extra row.
            if num % ncols != 0:
                nrows += 1

            # Make the objects
            fig, axes = plt.subplots(nrows=nrows, ncols=ncols)

            # This is a bit awkward to get the indices, but my matplotlib version
            # doesn't use the same object type as prettyplotlib creates.
            posns = np.indices(axes.shape)
            x, y = posns[0].ravel(), posns[1].ravel()

        # Keep the mean, median, std.
        data_stats = {}
        for i, column in enumerate(self.columns):
            data = self.dataframe[column]
            data = data[np.isfinite(data)]
            if num_bins is None:
                num_bins = int(np.sqrt(len(data)))

            data_stats[column] = [nanmean(data), nanstd(data), nanmedian(data)]

            if self.subplot:
                if use_prettyplotlib:
                    plt.hist(axes[x[i], y[i]], data, num_bins, grid="y")
                else:
                    axes[x[i], y[i]].hist(data, num_bins)
                axes[x[i], y[i]].set_xlabel(column)  # ADD UNITS!
            else:
                fig, axes = plt.subplots(1)
                axes.hist(data, num_bins)
                axes.set_xlabel(column)  # ADD UNITS!

            if self.verbose and not self.subplot:
                print(column + " Stats: %s" % (data_stats[column]))
                p.show()

            elif not self.subplot:
                fig.savefig(self.save_name + "_" + column + "." + self.save_type)
                p.close()

        if self.subplot:
            p.tight_layout()
            if self.verbose:
                for column in self.columns:
                    print(column + " Stats: %s" % (data_stats[column]))
                p.show()
            else:
                fig.savefig(self.save_name + "_hists." + self.save_type)
Example #54
0
    def test_nanmedian_none(self):
        """Check nanmedian when no values are nan."""
        m = stats.nanmedian(self.X)
        assert_approx_equal(m, np.median(self.X))
Example #55
0
    def test_nanmedian_some(self):
        """Check nanmedian when some values only are nan."""
        m = stats.nanmedian(self.Xsome)
        assert_approx_equal(m, np.median(self.Xsomet))
Example #56
0
    def test_nanmedian_all(self):
        """Check nanmedian when all values are nan."""
        m = stats.nanmedian(self.Xall)
        assert np.isnan(m)
Example #57
0
import numpy as np
from scipy import stats

data = np.genfromtxt('ship-nmpg.csv',
                     delimiter=",",
                     names=True,
                     dtype="f8,i8,f8,f8,f8,f8,i8,i8,S35")

hpmean = stats.nanmean(data['hp'])

hpmedian = stats.nanmedian(data['hp'])

imputeHP = np.round((hpmean + hpmedian) / 2.0)

for i in range(len(data['hp'])):
    if np.isnan(data['hp'][i]):
        data['hp'][i] = imputeHP  ## assign value here

np.savetxt('ship-nmpg-imp.csv',
           data,
           delimiter=',',
           newline='\n',
           fmt="%f,%i,%f,%f,%f,%f,%i,%i,%s")
Example #58
0
def mad(x):
    return stats.nanmedian(np.abs(x - stats.nanmedian(x)))
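A quick check of the helper against an obvious outlier; this sketch uses np.nanmedian, the modern replacement for scipy.stats.nanmedian:

import numpy as np

def mad(x):
    # median absolute deviation: robust to outliers and NaN-aware
    return np.nanmedian(np.abs(x - np.nanmedian(x)))

x = np.array([1.0, 2.0, 3.0, 4.0, 1000.0, np.nan])
print(mad(x))        # 1.0 -- the outlier barely moves it
print(np.nanstd(x))  # huge by comparison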