Example #1
def __calc_mul_multiband_cut_threshold():
    prefix = 197
    band_values = {k: [] for k in range(8)}
    band_cut_th = {k: dict(max=0, min=0) for k in range(8)}
    # for image_id in tqdm.tqdm(image_id_list[:500]):
    image_fn = "./img197/MUL-PanSharpen_AOI_3_Paris_img197.tif"
    with rasterio.open(image_fn, 'r') as f:
        values = f.read().astype(np.float32)
        for i_chan in range(8):
            values_ = values[i_chan].ravel().tolist()
            values_ = np.array([v for v in values_
                                if v != 0])  # Remove censored mask
            band_values[i_chan].append(values_)

    logger.info("Calc percentile point ...")
    for i_chan in range(8):
        band_values[i_chan] = np.concatenate(band_values[i_chan]).ravel()
        band_cut_th[i_chan]['max'] = scipy.percentile(band_values[i_chan], 98)
        band_cut_th[i_chan]['min'] = scipy.percentile(band_values[i_chan], 2)
    return band_cut_th
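The cut thresholds above are meant to be applied by clipping each band and rescaling, as Example #20 below does for RGB. A minimal sketch under that assumption (apply_band_cut is a hypothetical helper; note that scipy.percentile was a plain re-export of numpy.percentile and has been removed from SciPy's main namespace, so np.percentile is the drop-in replacement):

import numpy as np

def apply_band_cut(values, band_cut_th):
    # Hypothetical helper: clip each band to its percentile cut and rescale to [0, 1].
    out = np.empty_like(values, dtype=np.float32)
    for i_chan, th in band_cut_th.items():
        lo, hi = th['min'], th['max']
        out[i_chan] = (np.clip(values[i_chan], lo, hi) - lo) / (hi - lo)
    return out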
Example #2
def print_all_stats(ctx, series):
    ftime = get_ftime(series)
    start = 0 
    end = ctx.interval
    print('start-time, samples, min, avg, median, 90%, 95%, 99%, max')
    while (start < ftime):  # for each time interval
        end = ftime if ftime < end else end
        sample_arrays = [ s.get_samples(start, end) for s in series ]
        samplevalue_arrays = []
        for sample_array in sample_arrays:
            samplevalue_arrays.append( 
                [ sample.value for sample in sample_array ] )
        #print('samplevalue_arrays len: %d' % len(samplevalue_arrays))
        #print('samplevalue_arrays elements len: ' +
        #      str(map(lambda l: len(l), samplevalue_arrays)))
        # collapse list of lists of sample values into list of sample values
        samplevalues = reduce( array_collapser, samplevalue_arrays, [] )
        #print('samplevalues: ' + str(sorted(samplevalues)))
        # compute all stats and print them
        myarray = scipy.fromiter(samplevalues, float)
        mymin = scipy.amin(myarray)
        myavg = scipy.average(myarray)
        mymedian = scipy.median(myarray)
        my90th = scipy.percentile(myarray, 90)
        my95th = scipy.percentile(myarray, 95)
        my99th = scipy.percentile(myarray, 99)
        mymax = scipy.amax(myarray)
        print( '%f, %d, %f, %f, %f, %f, %f, %f, %f' % (
            start, len(samplevalues), 
            mymin, myavg, mymedian, my90th, my95th, my99th, mymax))

        # advance to next interval
        start += ctx.interval
        end += ctx.interval
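print_all_stats relies on an array_collapser helper that is not shown; a plausible minimal version is sketched below (and note that in Python 3, reduce must be imported from functools):

from functools import reduce  # required in Python 3

def array_collapser(acc, values):
    # Fold step for reduce(): extend the accumulator with one list of sample values.
    acc.extend(values)
    return acc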
Example #3
def fit_dispersion(counts, disp_raw, disp_conv, sf, CFG, dmatrix1):

    mean_count = sp.mean(counts / sf, axis=1)[:, sp.newaxis]
    index = sp.where(disp_conv)[0]

    lowerBound = sp.percentile(sp.unique(disp_raw[index]), 1)
    upperBound = sp.percentile(sp.unique(disp_raw[index]), 99)

    idx = sp.where((disp_raw > lowerBound) & (disp_raw < upperBound))[0]

    matrix = sp.ones((idx.shape[0], 2), dtype='float')
    matrix[:, 0] /= mean_count[idx].ravel()

    modGamma = sm.GLM(disp_raw[idx], matrix, family=sm.families.Gamma(sm.families.links.identity))
    res = modGamma.fit()
    Lambda = res.params

    disp_fitted = disp_raw.copy()
    ok_idx = sp.where(~sp.isnan(disp_fitted))[0]
    disp_fitted[ok_idx] = Lambda[0] / mean_count[ok_idx] + Lambda[1]

    if sp.sum(disp_fitted > 0) > 0:
        print "Found dispersion fit"

    if CFG['diagnose_plots']:
        plot.mean_variance_plot(counts=counts,
                                disp=disp_fitted,
                                matrix=dmatrix1,
                                figtitle='Fitted Dispersion Estimate',
                                filename=os.path.join(CFG['plot_dir'], 'dispersion_fitted.pdf'),
                                CFG=CFG)

    return (disp_fitted, Lambda, idx)
Example #4
def fit_dispersion(counts, disp_raw, disp_conv, sf, options, dmatrix1, event_type):

    mean_count = sp.mean(counts / sf, axis=1)[:, sp.newaxis]
    index = sp.where(disp_conv)[0]

    lowerBound = sp.percentile(sp.unique(disp_raw[index]), 1)
    upperBound = sp.percentile(sp.unique(disp_raw[index]), 99)

    idx = sp.where((disp_raw > lowerBound) & (disp_raw < upperBound))[0]

    matrix = sp.ones((idx.shape[0], 2), dtype='float')
    matrix[:, 0] /= mean_count[idx].ravel()

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        modGamma = sm.GLM(disp_raw[idx], matrix, family=sm.families.Gamma(sm.families.links.identity()))
    res = modGamma.fit()
    Lambda = res.params

    disp_fitted = disp_raw.copy()
    ok_idx = sp.where(~sp.isnan(disp_fitted))[0]
    disp_fitted[ok_idx] = Lambda[0] / mean_count[ok_idx] + Lambda[1]

    if sp.sum(disp_fitted > 0) > 0:
        print("\nFound dispersion fit")

    if options.diagnose_plots:
        plot.mean_variance_plot(counts=counts,
                                disp=disp_fitted,
                                matrix=dmatrix1,
                                figtitle='Fitted Dispersion Estimate',
                                filename=os.path.join(options.plot_dir, 'dispersion_fitted_%s.%s' % (event_type, options.plot_format)),
                                options=options)

    return (disp_fitted, Lambda, idx)
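Both fit_dispersion variants fit the same two-parameter model, dispersion ~ Lambda[0] / mean_count + Lambda[1], via a Gamma GLM with an identity link. A self-contained sketch on synthetic data (the lowercase links.identity matches the snippets above; recent statsmodels renames it links.Identity):

import numpy as np
import statsmodels.api as sm

mu = np.linspace(1.0, 100.0, 200)                              # synthetic mean counts
disp = 2.0 / mu + 0.1 + np.random.gamma(1.0, 0.005, mu.size)   # true Lambda ~ (2.0, 0.1)
X = np.column_stack([1.0 / mu, np.ones_like(mu)])              # same design as `matrix` above
res = sm.GLM(disp, X, family=sm.families.Gamma(sm.families.links.identity())).fit()
print(res.params)                                              # close to [2.0, 0.1]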
Example #5
def calc_multiband_norm(path_dir, image_list, image_feature_norm_csv,
                        kind='RGB-PanSharpen', channel_count=3, max_sample=100):
    band_values = {k: [] for k in range(channel_count)}
    band_cut_th = {k: dict(max=0, min=0) for k in range(channel_count)}
    #  first get all data, then use first part 0:1000 to calc threshold, then update all data
    for image_id in tqdm.tqdm(image_list[:max_sample]):
        image_loc = get_image_path_based_type_imageid(path_dir, kind, image_id)
        with rasterio.open(image_loc, 'r') as f:
            values = f.read().astype(np.float32)
            for i_chan in range(channel_count):
                values_ = values[i_chan].ravel().tolist()
                values_ = np.array(
                    [v for v in values_ if v != 0]
                )  # Remove censored mask
                band_values[i_chan].append(values_)

    logger.info("Calc percentile point for normalization")
    for i_chan in range(channel_count):
        band_values[i_chan] = np.concatenate(
            band_values[i_chan]).ravel()
        band_cut_th[i_chan]['max'] = scipy.percentile(
            band_values[i_chan], 98)
        band_cut_th[i_chan]['min'] = scipy.percentile(
            band_values[i_chan], 2)

    stat = dict()
    stat['path'] = path_dir
    for chan_i in band_cut_th.keys():
        stat['chan{}_max'.format(chan_i)] = band_cut_th[chan_i]['max']
        stat['chan{}_min'.format(chan_i)] = band_cut_th[chan_i]['min']
    pd.DataFrame(stat, index=[0]).to_csv(image_feature_norm_csv, index=False)
Example #6
def __calc_mul_multiband_cut_threshold(area_id, datapath):
    prefix = area_id_to_prefix(area_id)
    band_cut_th = {k: dict(max=0, min=0) for k in range(8)}

    image_id_list = pd.read_csv(FMT_VALTRAIN_IMAGELIST_PATH.format(
        prefix=prefix)).ImageId.tolist()
    image_id_list2 = pd.read_csv(FMT_VALTEST_IMAGELIST_PATH.format(
        prefix=prefix)).ImageId.tolist()
    image_id_list.extend(image_id_list2)

    for i_chan in range(8):
        logger.info("Reading band {} of the dataset..".format(i_chan))
        band_values = []
        for image_id in tqdm.tqdm(image_id_list[:500]):
            image_fn = get_train_image_path_from_imageid(
                image_id, datapath, mul=True)
            with rasterio.open(image_fn, 'r') as f:
                values = f.read().astype(np.float32)
                values_ = values[i_chan].ravel().tolist()
                values_ = np.array([v for v in values_ if v != 0]) # Remove censored mask
                band_values.append(values_)

        logger.info("Calc percentile point for band {}".format(i_chan))
        band_values = np.concatenate(band_values).ravel()
        band_cut_th[i_chan]['max'] = scipy.percentile(
            band_values, 98)
        band_cut_th[i_chan]['min'] = scipy.percentile(
            band_values, 2, overwrite_input=True)
    return band_cut_th
Example #7
def cal_8band(file):
    print(file)
    ds = gdal.Open(path_main + r'\RGB-PanSharpen' + "\\" + file)
    data = ds.ReadAsArray()
    data3 = data.copy()
    # plt.imshow(grayImg)
    # plt.show()
    # img2 = np.array([datax,datax,datax,datax]).swapaxes(0,1).swapaxes(1,2)
    img = np.array(data3).swapaxes(0, 1).swapaxes(1, 2)
    # hsv_image = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
    # r_min = min(data3[0])
    # r_max = max(data3[0])
    # g_min = min(data3[1])
    # g_max = max(data3[1])
    # b_min = min(data3[2])
    # b_max = max(data3[2])
    # hsv_image = rgb_to_hsv(img)
    # h, s, v = cv2.split(hsv_image)
    # band_gray = cv2.cvtColor(res2, cv2.COLOR_RGB2GRAY)
    bandstats = {k: dict(max=0, min=0) for k in range(3)}
    for i in range(3):
        bandstats[i]['min'] = scipy.percentile(data3[i], 2)
        bandstats[i]['max'] = scipy.percentile(data3[i], 98)

    for chan_i in range(3):
        min_val = bandstats[chan_i]['min']
        max_val = bandstats[chan_i]['max']
        data3[chan_i] = np.clip(data3[chan_i], min_val, max_val)
        data3[chan_i] = (data3[chan_i] - min_val)
    tgi_band = (data3[1] - 0.39 * data3[0] - 0.61 * data3[2]) * 10
    grayImg = 0.0722 * data3[0] + 0.7152 * data3[1] + 0.2126 * data3[2]
    # plt.imshow(tgi_band)
    # plt.show()
    data4 = data3.copy()
    data4 = np.asarray(data4, dtype=np.float32)
    for chan_i in range(3):
        min_val = bandstats[chan_i]['min']
        max_val = bandstats[chan_i]['max']
        data4[chan_i] = (data4[chan_i] / (max_val - min_val))
    img_1 = np.array(data4).swapaxes(0, 1).swapaxes(1, 2)
    hsv_image = rgb_to_hsv(img_1)
    # plt.imshow(hsv_image)
    # plt.show()
    data5 = np.array(hsv_image).swapaxes(2, 1).swapaxes(1, 0)
    img2 = np.array([
        data3[0], data3[1], data3[2], data5[0] * 360, data5[1] * 100,
        data5[2] * 100, tgi_band, grayImg
    ]).swapaxes(0, 1).swapaxes(1, 2)

    output = path_main + r'\MUL-PanSharpen_2' + "\\" + file
    driver = gdal.GetDriverByName("GTiff")
    dst_ds = driver.Create(output, ds.RasterXSize, ds.RasterYSize,
                           (img2.shape[2]),
                           gdal.GDT_UInt16)  #gdal.GDT_Byte/GDT_UInt16
    for i in range(1, img2.shape[2] + 1):
        dst_ds.GetRasterBand(i).WriteArray(img2[:, :, i - 1])
        dst_ds.GetRasterBand(i).ComputeStatistics(False)
    dst_ds.SetProjection(ds.GetProjection())
    dst_ds.SetGeoTransform(ds.GetGeoTransform())
    return 0
Example #8
def main(database):

    #Commits per committer, limited to the 40 committers with the highest accumulated activity
    query = "select count(*) from scmlog group by committer_id order by count(*) desc limit 40"

    #Connecting to the data base and retrieving data
    connector = connect(database)
    results = int(connector.execute(query))
    if results > 0:
        results_aux = connector.fetchall()
    else:
        print("Error when retrieving data")
        return

    #Moving data to a list
    commits = []
    for commit in results_aux[5:]:
        #   for commits in results_aux:
        commits.append(int(commit[0]))

    #Calculating basic statistics
    print "max: " + str(sp.amax(commits))
    print "min: " + str(sp.amin(commits))
    print "mean: " + str(sp.mean(commits))
    print "median: " + str(sp.median(commits))
    print "std: " + str(sp.std(commits))
    print ".25 quartile: " + str(sp.percentile(commits, 25))
    print ".50 quartile: " + str(sp.percentile(commits, 50))
    print ".75 quartile: " + str(sp.percentile(commits, 75))
Example #9
def __calc_rgb_multiband_cut_threshold(area_id, datapath):
    prefix = area_id_to_prefix(area_id)
    band_values = {k: [] for k in range(3)}
    band_cut_th = {k: dict(max=0, min=0) for k in range(3)}

    image_id_list = pd.read_csv(
        FMT_VALTRAIN_IMAGELIST_PATH.format(prefix=prefix)).ImageId.tolist()
    for image_id in tqdm.tqdm(image_id_list[:500]):
        image_fn = get_train_image_path_from_imageid(image_id, datapath)
        with rasterio.open(image_fn, 'r') as f:
            values = f.read().astype(np.float32)
            for i_chan in range(3):
                values_ = values[i_chan].ravel().tolist()
                values_ = np.array([v for v in values_
                                    if v != 0])  # Remove censored mask
                band_values[i_chan].append(values_)

    image_id_list = pd.read_csv(
        FMT_VALTEST_IMAGELIST_PATH.format(prefix=prefix)).ImageId.tolist()
    for image_id in tqdm.tqdm(image_id_list[:500]):
        image_fn = get_train_image_path_from_imageid(image_id, datapath)
        with rasterio.open(image_fn, 'r') as f:
            values = f.read().astype(np.float32)
            for i_chan in range(3):
                values_ = values[i_chan].ravel().tolist()
                values_ = np.array([v for v in values_
                                    if v != 0])  # Remove censored mask
                band_values[i_chan].append(values_)

    logger.info("Calc percentile point ...")
    for i_chan in range(3):
        band_values[i_chan] = np.concatenate(band_values[i_chan]).ravel()
        band_cut_th[i_chan]['max'] = scipy.percentile(band_values[i_chan], 98)
        band_cut_th[i_chan]['min'] = scipy.percentile(band_values[i_chan], 2)
    return band_cut_th
Example #10
def sliding_window(im_data, window_width, baseline_percentile):
    """Calculate df/f using a sliding window of given width centered
    around each timepoint and take the nth percentile as the baseline.

    """

    result = np.empty(im_data.shape)
    half_width = int(np.ceil(window_width / 2))  # int, so it can be used as a slice bound

    for (roi, timepoint, cycle), value in np.ndenumerate(im_data):
        # define the window extent
        if timepoint - half_width < 0:
            window_start = 0
        else:
            window_start = timepoint - half_width

        if timepoint + half_width > im_data.shape[1]:
            window_end = im_data.shape[1]
        else:
            window_end = timepoint + half_width
        # calculate the baseline as a percentile within the window
        baseline = percentile(im_data[roi, window_start:window_end, cycle],
                              baseline_percentile)

        # calculate df/f
        result[roi, timepoint, cycle] = (value - baseline) / baseline

    return result
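A quick smoke test on synthetic data; the bare percentile above is assumed to be numpy's (or old scipy's re-export of it), and the shape convention is ROIs x timepoints x cycles:

import numpy as np
from numpy import percentile

im_data = np.random.rand(2, 50, 3) + 1.0   # keep the baseline away from zero
dff = sliding_window(im_data, window_width=10, baseline_percentile=8)
print(dff.shape)                           # (2, 50, 3)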
Example #11
 def execLandMetric(self,name,nodata):        
     if name == "LC_Mean":            
         return unicode(name), numpy.mean(self.array[self.array!=nodata],dtype=numpy.float64)       
     if name == "LC_Sum":
         return unicode(name), numpy.sum(self.array[self.array!=nodata],dtype=numpy.float64)
     if name == "LC_Min":
         return unicode(name), numpy.min(self.array[self.array!=nodata],dtype=numpy.float64)
     if name == "LC_Max":
         return unicode(name), numpy.max(self.array[self.array!=nodata],dtype=numpy.float64)
     if name == "LC_SD":
         return unicode(name), numpy.std(self.array[self.array!=nodata],dtype=numpy.float64)
     if name == "LC_LQua":
         return unicode(name), scipy.percentile(self.array[self.array!=nodata],25)
     if name == "LC_Med":
         return unicode(name), numpy.median(self.array[self.array!=nodata],dtype=numpy.float64)
     if name == "LC_UQua":
         return unicode(name), scipy.percentile(self.array[self.array!=nodata],75)
     if name == "DIV_SH":
         if len(self.classes) == 1:
             func.DisplayError(self.iface,"LecoS: Warning" ,"This tool needs at least two landcover classes to calculate landscape diversity!","WARNING")
             return unicode(name), "NaN"
         else:
             return unicode(name), self.f_returnDiversity("shannon",nodata)
     if name == "DIV_EV":
         if len(self.classes) == 1:
             func.DisplayError(self.iface,"LecoS: Warning" ,"This tool needs at least two landcover classes to calculate landscape diversity!","WARNING")
             return unicode(name), "NaN"
         else:
             return unicode(name), self.f_returnDiversity("eveness",nodata)
     if name == "DIV_SI":
         if len(self.classes) == 1:
             func.DisplayError(self.iface,"LecoS: Warning" ,"This tool needs at least two landcover classes to calculate landscape diversity!","WARNING")
             return unicode(name), "NaN"
         else:
             return unicode(name), self.f_returnDiversity("simpson",nodata)
Example #12
def main(database):

    # Commits per committer, limited to the 40 committers with the highest accumulated activity
    query = "select count(*) from scmlog group by committer_id order by count(*) desc limit 40"

    # Connecting to the data base and retrieving data
    connector = connect(database)
    results = int(connector.execute(query))
    if results > 0:
        results_aux = connector.fetchall()
    else:
        print ("Error when retrieving data")
        return

    # Moving data to a list
    commits = []
    for commit in results_aux[5:]:
        #   for commits in results_aux:
        commits.append(int(commit[0]))

    # Calculating basic statistics
    print "max: " + str(sp.amax(commits))
    print "min: " + str(sp.amin(commits))
    print "mean: " + str(sp.mean(commits))
    print "median: " + str(sp.median(commits))
    print "std: " + str(sp.std(commits))
    print ".25 quartile: " + str(sp.percentile(commits, 25))
    print ".50 quartile: " + str(sp.percentile(commits, 50))
    print ".75 quartile: " + str(sp.percentile(commits, 75))
Example #13
 def execLandMetric(self,name,nodata):        
     if name == "LC_Mean":            
         return unicode(name), numpy.mean(self.array[self.array!=nodata],dtype=numpy.float64)       
     if name == "LC_Sum":
         return unicode(name), numpy.sum(self.array[self.array!=nodata],dtype=numpy.float64)
     if name == "LC_Min":
         return unicode(name), numpy.min(self.array[self.array!=nodata])
     if name == "LC_Max":
         return unicode(name), numpy.max(self.array[self.array!=nodata])
     if name == "LC_SD":
         return unicode(name), numpy.std(self.array[self.array!=nodata],dtype=numpy.float64)
     if name == "LC_LQua":
         return unicode(name), scipy.percentile(self.array[self.array!=nodata],25)
     if name == "LC_Med":
         return unicode(name), numpy.median(self.array[self.array!=nodata])
     if name == "LC_UQua":
         return unicode(name), scipy.percentile(self.array[self.array!=nodata],75)
     if name == "DIV_SH":
         if len(self.classes) == 1:
             func.DisplayError(self.iface,"LecoS: Warning" ,"This tool needs at least two landcover classes to calculate landscape diversity!","WARNING")
             return unicode(name), "NaN"
         else:
             return unicode(name), self.f_returnDiversity("shannon",nodata)
     if name == "DIV_EV":
         if len(self.classes) == 1:
             func.DisplayError(self.iface,"LecoS: Warning" ,"This tool needs at least two landcover classes to calculate landscape diversity!","WARNING")
             return unicode(name), "NaN"
         else:
             return unicode(name), self.f_returnDiversity("eveness",nodata)
     if name == "DIV_SI":
         if len(self.classes) == 1:
             func.DisplayError(self.iface,"LecoS: Warning" ,"This tool needs at least two landcover classes to calculate landscape diversity!","WARNING")
             return unicode(name), "NaN"
         else:
             return unicode(name), self.f_returnDiversity("simpson",nodata)
Example #14
def test_single_parameter_percentile():
    dist_f = PercentileDistanceFunction(measures_to_use=["a"])
    abc = MockABC([{"a": -3}, {"a": 3}, {"a": 10}])
    dist_f.initialize(abc.sample_from_prior())
    d = dist_f({"a": 1}, {"a": 2})
    expected = (
        1 / (sp.percentile([-3, 3, 10], 80) - sp.percentile([-3, 3, 10], 20)))
    assert expected == d
Example #15
    def stats(self, *args, **kwargs):
        import scipy as sp
        result = {}

        cp.thread_data.conn.execute(
            "SET SESSION TRANSACTION ISOLATION LEVEL READ UNCOMMITTED")

        just_diffs = '''
		select 
		upm.eqdiff, 
		(upm.ratinguser - upm.increment) - (upm.ratingpos + upm.increment)
		from usersposmatchids upm
		where upm.eqdiff is not null
		and upm.increment is not null
		and upm.submittedat >= date_add(utc_timestamp(), interval - %s day)
		and upm.plasubms >= %s
		and upm.possubms >= %s
		limit 100000
		'''

        # First remove the positions where there is no previous rating.
        rows = [
            r for r in cp.thread_data.conn.execute(just_diffs, [
                int(kwargs.get('ld', 5000)),
                int(kwargs.get('pls', 20)),
                int(kwargs.get('pos', 30))
            ]) if r[1] is not None
        ]
        ratings = [r[1] for r in rows]
        diffs = [r[0] for r in rows]

        rating_quintiles = sp.percentile(ratings, [20, 40, 60, 80])

        result['mean_rating_of_quintile_to_figs'] = {}

        last_quint = -1000
        for quint in list(rating_quintiles) + [100000]:  # list(), so this appends rather than broadcasting an ndarray add
            diffs = [r[0] for r in rows if last_quint < r[1] < quint]
            ratings = [r[1] for r in rows if last_quint < r[1] < quint]
            mean_rating_of_quintile = sp.mean(ratings)
            if len(diffs) > 0:
                n = len(diffs)
                mean = sp.mean(diffs)
                std = sp.std(diffs)
                quartiles = sp.percentile(diffs, [25, 50, 75])
                last_quint = quint

                figs = {}
                result['mean_rating_of_quintile_to_figs'][
                    mean_rating_of_quintile] = figs
                figs['n'] = n
                figs['mean'] = mean
                figs['std'] = std
                figs['quartiles'] = quartiles

        cp.thread_data.conn.commit()

        return result
Example #16
def get_list_statistics(lst):
    return {
        'min': amin(lst),
        'max': amax(lst),
        'avg': mean(lst),
        'median': median(lst),
        'std': std(lst),
        'q1': percentile(lst, 25),
        'q3': percentile(lst, 75)
    }
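Assuming the bare names come from numpy (from numpy import amin, amax, mean, median, std, percentile), a quick call:

from numpy import amin, amax, mean, median, std, percentile

print(get_list_statistics([1, 2, 3, 4, 100]))
# {'min': 1, 'max': 100, 'avg': 22.0, 'median': 3.0, 'std': 39.01..., 'q1': 2.0, 'q3': 4.0}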
Example #17
def key_stats(values):
    return dict({
            'mean'   : sp.mean(values),
            'std'    : sp.std(values)/sp.sqrt(len(values)),  # note: this is the standard error of the mean, not the std
            'max'    : sp.array(values).max(),
            'min'    : sp.array(values).min(),
            'median' : sp.median(values),
            'p25'    : sp.percentile(values, 25),
            'p75'    : sp.percentile(values, 75),
            'values' : sp.array(values)
            })
Example #18
def Disp(Img,vmin=0,vmax=0,fname=""): 
    from scipy import percentile
    import matplotlib.pyplot as plt


    if (vmin==vmax):
        vmin=percentile(Img,2)
        vmax=percentile(Img,98)
    plt.imshow(Img,vmin=vmin,vmax=vmax,cmap = plt.get_cmap('gray'),interpolation='None')
    plt.axis('off')
    if (fname!=""):
        plt.savefig(fname,bbox_inches='tight')
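Typical use, as a sketch: the percentile-based limits stretch the display so a few extreme pixels do not dominate the gray scale:

import numpy as np

img = np.random.rand(64, 64) ** 3               # synthetic image with a heavy tail
Disp(img)                                       # auto vmin/vmax at the 2nd/98th percentiles
Disp(img, vmin=0.0, vmax=1.0, fname="img.png")  # fixed range, saved to disk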
Example #19
def stats(log_files):
    for log_file in log_files:
        print log_file
        f = open(log_file).read()
        print 'OVERALL'
        for field, num in re.findall(r'\[OVERALL\], ([a-zA-Z()/]*), (\d*)', f):
            print '%s\t%s' % (field.rjust(20), num)
        print 'READ'
        for field, num in re.findall(r'\[READ\], ([a-zA-Z()/]*), (\d*)', f):
            print '%s\t%s' % (field.rjust(20), num)
        read_lat = [float(x)
                    for x in re.findall(r'\[READ\], \d*, ([\d.]*)', f)]
        c = scipy.percentile(read_lat, 25)
        o = scipy.percentile(read_lat, 75)
        h = scipy.percentile(read_lat, 99)
        l = scipy.percentile(read_lat, 1)
        print '%s\t%s' % ('open'.rjust(20), o)
        print '%s\t%s' % ('high'.rjust(20), h)
        print '%s\t%s' % ('low'.rjust(20), l)
        print '%s\t%s' % ('close'.rjust(20), c)
        print 'INSERT'
        for field, num in re.findall(r'\[INSERT\], ([a-zA-Z()/]*), (\d*)', f):
            print '%s\t%s' % (field.rjust(20), num)
        insert_lat = [float(x)
                      for x in re.findall(r'\[INSERT\], \d*, ([\d.]*)', f)]
        c = scipy.percentile(insert_lat, 25)
        o = scipy.percentile(insert_lat, 75)
        h = scipy.percentile(insert_lat, 90)
        l = scipy.percentile(insert_lat, 10)
        print '%s\t%s' % ('open'.rjust(20), o)
        print '%s\t%s' % ('high'.rjust(20), h)
        print '%s\t%s' % ('low'.rjust(20), l)
        print '%s\t%s' % ('close'.rjust(20), c)
Example #20
def resize_original_im(img):
    data3 = img.copy()
    bandstats = {k: dict(max=0, min=0) for k in range(3)}
    for i in range(3):
        bandstats[i]['min'] = scipy.percentile(data3[i], 2)
        bandstats[i]['max'] = scipy.percentile(data3[i], 98)

    for chan_i in range(3):
        min_val = bandstats[chan_i]['min']
        max_val = bandstats[chan_i]['max']
        data3[chan_i] = np.clip(data3[chan_i], min_val, max_val)
        data3[chan_i] = (data3[chan_i] - min_val) / (max_val - min_val) * 255
    img2 = np.array(data3).swapaxes(0, 1).swapaxes(1, 2)
    return img2
Example #21
def plot_correlations_vi(model,
                         input,
                         features_names: List = None,
                         save_path=None):
    """ Plot the correlations """

    output = model.forward(*input)
    pred = output[2]
    num_dim = pred.shape[1]
    data_x = input[0]

    fig, axes = plt.subplots(num_dim,
                             1,
                             figsize=(3 * 1, 3 * num_dim),
                             squeeze=False,
                             sharex=False,
                             sharey=False)
    for ix in range(num_dim):
        axes[ix, 0].axhline(y=0, xmin=-1, xmax=1, linestyle="--", color='red')
        axes[ix, 0].plot(data_x[:, ix].cpu().data.numpy(),
                         pred[:, ix].cpu().data.numpy(),
                         ms=4,
                         marker=".",
                         linestyle="")

        min_val = scipy.percentile(data_x[:, ix], 1)
        max_val = scipy.percentile(data_x[:, ix], 99)
        axes[ix, 0].set_xlim([min_val, max_val])

        min_val = scipy.percentile(pred[:, ix].cpu().data.numpy(), 1)
        max_val = scipy.percentile(pred[:, ix].cpu().data.numpy(), 99)
        axes[ix, 0].set_ylim([min_val, max_val])

        if features_names is None:
            axes[ix, 0].set_xlabel(f"target_{ix}")
            axes[ix, 0].set_ylabel(f"pred_{ix}")
        else:
            axes[ix, 0].set_xlabel(f"{features_names[ix]}")
            axes[ix, 0].set_ylabel(f"pred_{features_names[ix]}")

    fig.suptitle("Correlation between predictor and indicator")
    # plt.show()

    if save_path is not None:
        fig.tight_layout(rect=[0, 0.03, 1, 0.95])
        fig.savefig(save_path, bbox_inches='tight', format='png', dpi=200)
        plt.close(fig)

    return fig
Example #22
def get_color(listing, listings, f, cm):
    price = f(listing)
    prices = [f(l) for l in listings]

    lower = sp.percentile(prices, 10)
    upper = sp.percentile(prices, 90)

    relative_price = (price - lower)/(upper - lower)
    color = cm(sp.clip(relative_price, 0, 1))

    is_dark = sum(color[:3])/4 < 0.4
    background_color = tuple([int(255*c) for c in color[:3]])
    text_color = (230, 230, 230) if is_dark else (50, 50, 50)

    return background_color, text_color
Example #23
    def make_features(self, s):
        from scipy import percentile

        features_koh, blocks = Kohlschuetter.make_features(s)
        if features_koh is None:
            return None, blocks

        features = np.zeros((features_koh.shape[0], 6 + 4 + 2))
        features[:, :6] = features_koh[:]

        # a global feature based on connected blocks of long text
        # inspired by Arias
        block_lengths = np.array([len(block.text) for block in blocks])
        index = block_lengths.argmax()
        k = 6
        for c in [0.15, 0.3333]:
            for window in [1, 4]:
                cutoff = int(percentile(block_lengths, 97) * c)
                lowindex, highindex = KohlschuetterExpanded.strip(block_lengths, index, window, cutoff)
                features[lowindex:(highindex + 1), k] = 1.0
                k += 1

        features[:, -2:] = capital_digit_features(blocks)
        normalize_features(features, self._mean_std)
        return features, blocks
Example #24
def discretizeY(Y, col, firstThresh=33.3333, secondThresh=66.6666):
    '''
    Discretize and return a specific column of Y. The strategy:
    data with score <= the 33rd percentile form the "low" group,
    data with score >= the 66th percentile form the "high" group,
    and the rest form the "medium" group.
    '''
    y = Y[:, col]
    if kwlist[col] == 'Totalviews':
        y = np.log(y)
    lowthresh = sp.percentile(y, firstThresh)
    hithresh = sp.percentile(y, secondThresh)
    y[y <= lowthresh] = -1  # Low group
    y[y >= hithresh] = 1  # High group
    y[(y > lowthresh) * (y < hithresh)] = 0  # Medium group
    return y
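A standalone sketch of the same three-way split using np.percentile directly (the kwlist lookup above is project-specific and skipped here):

import numpy as np

y = np.arange(1.0, 101.0)                     # scores 1..100
lo, hi = np.percentile(y, [33.3333, 66.6666])
labels = np.zeros_like(y)
labels[y <= lo] = -1                          # "low" group
labels[y >= hi] = 1                           # "high" group
print(np.bincount((labels + 1).astype(int)))  # roughly equal thirds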
Example #25
File: core.py Project: jumbokh/gwpy
    def percentile(self, percentile):
        """Calculate a given spectral percentile for this `Spectrogram`.

        Parameters
        ----------
        percentile : `float`
            percentile (0 - 100) of the bins to compute

        Returns
        -------
        spectrum : `~gwpy.frequencyseries.FrequencySeries`
            the given percentile `FrequencySeries` calculated from this
            `SpectralVariance`
        """
        out = scipy.percentile(self.value, percentile, axis=0)
        name = '%s %s%% percentile' % (self.name, percentile)
        return FrequencySeries(
            out,
            epoch=self.epoch,
            channel=self.channel,
            name=name,
            f0=self.f0,
            df=self.df,
            frequencies=(hasattr(self, '_frequencies') and self.frequencies
                         or None))
Example #26
def two_channel_to_color(im):
    """Converts a two-channel microarray image to a color image, as described in the paper associated with this 
    codebase"""
    lower = sp.percentile(im, 5)
    upper = sp.percentile(im, 98)   
    
    channel_0 = sp.clip((im[:, :, 0] - lower)/(upper - lower), 0, 1)
    channel_2 = sp.clip((im[:, :, 1] - lower)/(upper - lower), 0, 1)
    channel_1 = ((channel_0 + channel_2)/2.)
    
    im = sp.array((channel_0, channel_1, channel_2))
    im = sp.rollaxis(im, 0, 3)
    
    im = (255*im).astype(sp.uint8)    
    
    return im
Example #27
	def get_zbins(cat, nzbin):
		edges = [0]
		for i in range(1, nzbin+1):
			edges += [sp.percentile(cat['PHOTOZ_GAUSSIAN'], i*100./nzbin)]
			print 'bin %d' % i

		return edges
Example #28
def plotprofile(confs,nreps,path,tol=0.9,target=1e-6):
    f=[]
    a=[]
    pmax=1
    for i in range(pmax):
        f_,a_ = plt.subplots(1)
        for item in ([a_.title, a_.xaxis.label, a_.yaxis.label] + a_.get_xticklabels() + a_.get_yticklabels()):
            item.set_fontsize(10)
        f.append(f_)
        a.append(a_)
    colorlist = ['b','r','g','purple','k','grey','orange','c','lightgreen','lightblue','pink','b','r','g','purple','k','grey','orange','c','lightgreen','lightblue','pink']
    lslist = ['solid' , 'dashed', 'dashdot', 'dotted','solid' , 'dashed', 'dashdot', 'dotted','solid' , 'dashed', 'dashdot', 'dotted','solid' , 'dashed', 'dashdot', 'dotted','solid' , 'dashed', 'dashdot', 'dotted']
    ci=-1

    for C in confs:
        print('plotting {}...'.format(C[0]))
        ci+=1
        col = colorlist[ci]
        line = lslist[ci]
        #collect the data
        data=[]
        xoverheads = sp.empty(nreps)
        xntotarget = sp.zeros(nreps)
        overheads = sp.empty(nreps)
        noverheads = [sp.empty(nreps) for i in range(len(C[1]['N']))]
        ninrun = sp.zeros(nreps)
        support = sp.logspace(-2,5,200)
        success = sp.zeros(nreps)
        for ii in range(nreps):
            D = gpbo.optimize.readoptdata(os.path.join(path,'{}_{}.csv'.format(C[0],ii)))
            A = sp.array(D['trueyatxrecc'].values)
            if A.min()>=target:
                xoverheads[ii]=sum(D['taq'])
                overheads[ii]=sum(D['taq'])
                for k,n in enumerate(C[1]['N']):
                    noverheads[k][ii] = sum(D['taq'][:n])
            else:
                success[ii]=1
                i = sp.argmax(A<=target)#while A[i]>=target:
                xoverheads[ii] = sum(D['taq'][:i+1])
                overheads[ii] = sum(D['taq'])
                xntotarget[ii] = i
                ninrun[ii]=len(D['taq'])
                for k,n in enumerate(C[1]['N']):
                    noverheads[k][ii] = sum(D['taq'].values[:n])
        if sp.mean(success)>=tol:
            if C[1]['oracle']:
                a[0].plot(support,sp.mean(xoverheads)+sp.mean(xntotarget)*support,col,label=C[0]+'oracle',linestyle='dashdot')
            if C[1]['full']:
                a[0].plot(support,sp.mean(overheads)+sp.mean(ninrun)*support,col,label=C[0]+'_all',linestyle='dashed')
            for k,n in enumerate(C[1]['N']):
                if sp.percentile(xntotarget,int(tol*100))<n:
                    a[0].plot(support,sp.mean(noverheads[k])+n*support,col,label=C[0]+str(n),linestyle='solid')
        else:
            print('{} only achieved target on {}'.format(C[0],sp.mean(success)))
        a[0].set_xscale('log')
        a[0].set_yscale('log')
        a[0].legend()

    f[0].savefig(os.path.join(path,'profile_{}.png'.format(sp.log10(target))),bbox_inches='tight', pad_inches=0.1)
Example #29
def credibility_interval(post, alpha=1.):
    """Calculate bayesian credibility interval.
    Parameters:
    -----------
    post : array_like
        The posterior sample over which to calculate the bayesian credibility
        interval.
    alpha : float, optional
        Confidence level.
    Returns:
    --------
    med : float
        Median of the posterior.
    low : float
        Lower part of the credibility interval.
    up : float
        Upper part of the credibility interval.
    """
    z = erf(alpha / sp.sqrt(2))

    lower_percentile = 100 * (1 - z) / 2
    upper_percentile = 100 * (1 + z) / 2
    low, med, up = sp.percentile(post,
                                 [lower_percentile, 50, upper_percentile])
    return med, low, up
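Sanity check, as a sketch (assuming the snippet's sp alias and erf import are in scope): for a standard normal posterior sample, alpha=1 should recover roughly the familiar one-sigma band:

import numpy as np

post = np.random.standard_normal(100000)
med, low, up = credibility_interval(post, alpha=1.0)
print(med, low, up)   # approximately 0.0, -1.0, 1.0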
Example #30
 def func(ms_):
     # Unless the distribution is extremely skewed, the MAP estimate should lie inside the 95% credible interval
     lo, hi = scipy.percentile(ms_, q=[2.5, 97.5])
     kde = gaussian_kde(ms_)
     xs = scipy.linspace(lo, hi, 1000)
     ys = kde.evaluate(xs)
     return xs[scipy.argmax(ys)]
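A quick check of this KDE-based MAP estimate on a skewed sample; func itself relies on the old scipy namespace re-exports (scipy.percentile, scipy.linspace, scipy.argmax), so substitute numpy on modern SciPy. The mode of a Gamma(5, 1) distribution is 4:

import numpy as np
from scipy.stats import gaussian_kde

ms = np.random.gamma(5.0, 1.0, 50000)
print(func(ms))   # close to the analytic mode, 4.0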
Example #31
def getSizeFactor(fn_anno, data, gid, mode = 'sum', withXYMT = True, filterbyPC = True):
    '''
    input annotation, counts and gene ids
    output sum of protein coding gene levels excluding sex chromosomes and mitochondria genes
    '''
    anno  = sp.loadtxt(fn_anno, delimiter = '\t', dtype = 'string', usecols=[0,2,8])
    anno  = anno[anno[:,1] == 'gene', :]
    if not withXYMT: ### filter xymt
        anno  = anno[anno[:,0] != 'MT',:]
        anno  = anno[anno[:,0] != 'Y',:]
        anno  = anno[anno[:,0] != 'X',:]

    agid   = [x.split(';')[0] for x in anno[:,2]] ### clean gene id's
    agid   = sp.array([x.split(" ")[1].strip('\"') for x in agid])

    if filterbyPC: ### filter protein coding
        gtpe  = [x.split(';')[2] for x in anno[:,2]]
        gtpe  = sp.array([x.split('\"')[1].split('\"')[0] for x in gtpe])
        iPC   = sp.where(gtpe == 'protein_coding')[0]
        agid  = agid[iPC]

    iGn = sp.in1d(gid, agid)
    libsize = sp.sum(data[iGn,:], axis = 0) 
    if mode == 'uq':
         libsize = sp.array([sp.percentile(x[x!=0] ,75) for x in data[iGn,:].T])  * iGn.sum() 

    return libsize
Example #32
def two_channel_to_color(im):
    """Converts a two-channel microarray image to a color image, as described in the paper associated with this 
    codebase"""
    lower = sp.percentile(im, 5)
    upper = sp.percentile(im, 98)

    channel_0 = sp.clip((im[:, :, 0] - lower) / (upper - lower), 0, 1)
    channel_2 = sp.clip((im[:, :, 1] - lower) / (upper - lower), 0, 1)
    channel_1 = ((channel_0 + channel_2) / 2.)

    im = sp.array((channel_0, channel_1, channel_2))
    im = sp.rollaxis(im, 0, 3)

    im = (255 * im).astype(sp.uint8)

    return im
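Shape check, as a sketch; note that both percentile cuts are computed over the two channels jointly, not per channel, and that the function assumes the old scipy re-exports (sp.percentile, sp.clip, sp.rollaxis):

import numpy as np

im = np.random.rand(32, 32, 2) * 1000.0   # fake two-channel microarray scan
rgb = two_channel_to_color(im)
print(rgb.shape, rgb.dtype)               # (32, 32, 3) uint8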
Example #33
def skip_extrem(im):
    # Get percentile
    pm, pM = sp.percentile(im, [2, 98])
    ims = im.copy()  # copy, so the caller's image is not clipped in place
    ims[im < pm] = pm
    ims[im > pM] = pM
    return ims
Example #34
    def alt_results(self, samples, kplanets):
        titles = sp.array(["Period","Amplitude","Longitude", "Phase","Eccentricity", 'Acceleration', 'Jitter', 'Offset', 'MACoefficient', 'MATimescale', 'Stellar Activity'])
        namen = sp.array([])
        ndim = kplanets * 5 + self.nins*2*(self.MOAV+1) + self.totcornum + 1 + self.PACC

        RESU = sp.zeros((ndim, 5))
        for k in range(kplanets):
            namen = sp.append(namen, [titles[i] + '_'+str(k) for i in range(5)])
        namen = sp.append(namen, titles[5])  # for acc
        if self.PACC:
            namen = sp.append(namen, 'Parabolic Acceleration')
        for i in range(self.nins):
            namen = sp.append(namen, [titles[ii] + '_'+str(i+1) for ii in sp.arange(2)+6])
            for c in range(self.MOAV):
                namen = sp.append(namen, [titles[ii] + '_'+str(i+1) + '_'+str(c+1) for ii in sp.arange(2)+8])
        for h in range(self.totcornum):
            namen = sp.append(namen, titles[-1]+'_'+str(h+1))

        if self.PM:
            for g in range(self.nins_pm):
                for gg in range(self.lenppm):
                    namen = sp.append(namen, 'Photometry param'+str(g)+'_'+str(gg+1))

        alt_res = list(map(lambda v: (v[2], v[3]-v[2], v[2]-v[1], v[4]-v[2], v[2]-v[0]),
                      zip(*np.percentile(samples, [2, 16, 50, 84, 98], axis=0))))
        logdat = '\nAlternative results with uncertainties based on the 2nd, 16th, 50th, 84th and 98th percentiles of the samples in the marginalized distributions'
        logdat += '\nFormat is like median +- 1-sigma, +- 2-sigma'
        for res in range(ndim):
            logdat += '\n'+namen[res]+'     : '+str(alt_res[res][0])+' +- '+str(alt_res[res][1:3]) +'    2%   +- '+str(alt_res[res][3:5])
            RESU[res] = sp.percentile(samples, [2, 16, 50, 84, 98], axis=0)[:, res]
        print(logdat)
        return RESU
Example #35
def plotquartsends(a,xdata_, ydata_,col,line,lab,log=False,mean=False):
    xdata = [sp.array(i) for i in xdata_]
    ydata = [sp.array(i) for i in ydata_]
    n = len(xdata)
    ints = []
    starts = sp.empty(n)
    ends = sp.empty(n)
    yends = sp.empty(n)
    for i in xrange(n):
        starts[i] = xdata[i][0]
        ends[i] = xdata[i][-1]
        yends[i] = ydata[i][-1]
    yendorder = sp.argsort(yends)

    Ydata = [sp.hstack([y[0], y, y[-1]]) for y in ydata]
    #the pad values are slightly outside the true range so that exp(log(value)) stays in the interpolation range
    Xdata = [sp.hstack([0.999*min(starts), x, max(ends)*1.001]) for x in xdata]
    #print(min(starts),max(ends))
    for i in xrange(n):
        #print(Xdata[i][0],Xdata[i][-1])
        ints.append(sp.interpolate.interp1d(Xdata[i], Ydata[i]))
       # a.plot(Xdata[i], Ydata[i], 'lightblue')

    if log:
        x = sp.logspace(sp.log10(min(starts)), sp.log10(max(ends)), 200)
    else:
        x = sp.linspace(min(starts), max(ends), 200)

    #print(x)
    if mean:
        a.plot(x, map(lambda x: sp.mean([i(x) for i in ints]), x), color=col,label=lab)
    else:
        a.plot(x, map(lambda x: sp.percentile([i(x) for i in ints], 50), x), color=col,label=lab)
        #m = map(lambda x: sp.mean([i(x) for i in ints]), x)
        #v = map(lambda x: sp.mean([i(x) for i in ints]), x)
        #a.plot(x, map(lambda x: sp.mean([i(x) for i in ints]), x), color=col,label=lab)

        y25 = map(lambda x: sp.percentile([i(x) for i in ints], 25), x)
        y75 = map(lambda x: sp.percentile([i(x) for i in ints], 75), x)
        a.fill_between(x,y25,y75,edgecolor=col, facecolor=col,lw=0.0,alpha=0.1)
    #a.plot(ends[yendorder], yends[yendorder], '.',color=col ,linestyle=line)
    #print("endvalues: {}".format(yends))
    a2 = a.twinx()
    a2.grid(False)
    a2.plot(ends[sp.argsort(ends)],sp.linspace(1,0,n),color=col, linestyle='--',linewidth=0.4)
    a2.set_ylabel('fraction of optimizations still running')
    return
Example #36
    def topWords(self,vote,plotWordCloud=True):

        freqs = list(self.wordRelativeFreqDic.values())

        topWord = {}
        percLimit = 1
        # print("Top words on ",vote)
        if vote=='Yes':
            thr = sp.percentile(freqs,q=100-percLimit)
        else:
            thr = sp.percentile(freqs,q=percLimit)
        total = 0
        for pair in self.wordRelativeFreqDic.items():
            if (vote=='Yes' and pair[1]>thr) or (vote=='No' and pair[1]<thr):
                topWord[pair[0]] = abs(pair[1])
                total += topWord[pair[0]]

        if plotWordCloud:
            for key in topWord:
                topWord[key] = topWord[key]/total
                print(key,topWord[key])


            wordcloud = WordCloud(max_font_size=40, relative_scaling=.5,background_color='white',max_words=50).generate_from_frequencies(topWord.items())

            plt.figure()
            plt.imshow(wordcloud)
            plt.axis("off")
            plt.savefig('Temp/WordCloud_TopOnly'+vote+'.png')
            plt.close()

        listKeys = list(topWord.keys())
        try:
            listKeys.remove('sim')
        except:
            pass
        try:
            listKeys.remove('nao')
        except:
            pass

        return listKeys

# topDif = topDifferentWords(getVoteData())
#
# print(topDif.topWords('Yes'))
# print(topDif.topWords('No'))
Example #37
 def returnArrayHigherQuant(self,array):
     if numpy.size(array) != 0 and self.count_nonzero(array) != 0:
         try:
             return scipy.percentile(array[array!=self.nodata],75)
         except ValueError:
             return None
     else:
         return None
Example #38
 def returnArrayHigherQuant(self, array):
     if numpy.size(array) != 0 and self.count_nonzero(array) != 0:
         try:
             return scipy.percentile(array[array != self.nodata], 75)
         except ValueError:
             return None
     else:
         return None
Example #39
    def _add_colorbar_(self,axcolorbar=None,
                       label= None,
                       verticale=True,
                       no_ticks=True,
                       add_legend=True):
        """
        """
        if axcolorbar is None:
            if verticale:
                self.axcbar = self.fig.add_axes([0.9,0.10,0.03,0.80])
            else:
                self.axcbar = self.fig.add_axes([0.10,0.9,0.80,0.04])
        else:
            self.axcbar = axcolorbar

        # ----------------- #
        vmin,vmax = self._skyPlot_color_ranges_
        norm = P.matplotlib.colors.Normalize(vmin=vmin,
                                            vmax=vmax)
        
        if verticale:
            x,y=N.mgrid[1:10:0.05,1:10]
            self._colorbar_ = self.axcbar.imshow(10-x, cmap=self.scatter_cmap)
        else:
            x,y=N.mgrid[1:10,1:10:0.1]
            self._colorbar_ = self.axcbar.imshow(10-x, cmap=self.scatter_cmap)
        
        if label is not None:
            self.axcbar.set_xlabel(label,fontsize=fontsize_label)
            
        if no_ticks:
            self.axcbar.set_xticks([])
            self.axcbar.set_yticks([])
            
        if add_legend:
            if verticale:
                loc = (0.5,1.)
            else:
                loc = (1.,.5)
            print "legend"
            self.axcbar.text(loc[0],loc[1],r"$\mathrm{%s}$"%self._skyPlot_colored_by_,
                             
                        fontsize="large",
                        va="bottom",ha="center",
                        transform=self.axcbar.transAxes,
                        )
            range_percent_to_show = [0,0.33,0.66,1]
            if verticale:
                [self.axcbar.text(1.02,x,r"$%+.1e$"%(percentile(self._colored_values_,(x*(vmax-vmin) + vmin )*100)),fontsize="small",
                        va="center",ha="left",
                        transform=self.axcbar.transAxes)
                for x in range_percent_to_show]
            else:
                [self.axcbar.text(1.0-x*0.95,-.2,r"$%+.1e$"%(x*100),fontsize="small",
                        va="top",ha="center",
                        transform=self.axcbar.transAxes)
                for x in range_percent_to_show]
Example #40
    def _load_sky_scatter_color_(self,colored_by,default_color="b",
                                 vmin=0,vmax=1):
        """
        """
        self._skyPlot_colored_by_    = colored_by
        self._skyPlot_color_ranges_  = [vmin, vmax]
        if colored_by is None:
            self._colored_values_ = None
            self._color_used_ = default_color
            return None
        
        if colored_by not in dir(self.Samp):
            raise ValueError("Sorry I don't have any %s in module self.Samp"%colored_by)

        self._colored_values_ = N.asarray(self.Samp.__dict__[colored_by])

        self._color_used_     = self.scatter_cmap((self._colored_values_-percentile(self._colored_values_,vmin*100.) )\
                                                   / (percentile(self._colored_values_,vmax*100.)-percentile(self._colored_values_,vmin*100.)))
Example #41
	def find_photoz_bin_edges(self, nbin, sel=None, binning='mode'):
		if sel is None:	sel = np.in1d(self.data['pz_cond_%s'%binning], self.data['pz_cond_%s'%binning])

		self.data['tomographic_bin_edges'] = [ min(self.data['pz_cond_%s'%binning][sel]) ]
		for i in range(1,nbin+1):
			edge = sp.percentile(self.data['pz_cond_%s'%binning][sel],i*100.0/nbin)
			self.data['tomographic_bin_edges'] += [edge]
		self.data['tomographic_bin_edges'] += [max(self.data['pz_cond_%s'%binning][sel])]
		self.data['tomographic_bin_edges'] = np.array(self.data['tomographic_bin_edges'])
Example #42
def pairwise_dists(data, nneighbors=10, folder='model', dist='l2'):
    '''

    Computes pairwise distances between bag-of-words vectors of articles

    INPUT
    folder      model folder
    nneighbors  number of closest neighbors to include in distance list

    '''
    stopwords = codecs.open("stopwords.txt", "r", encoding="utf-8", errors='ignore').readlines()[5:]
    stops = map(lambda x:x.lower().strip(),stopwords)

    # using now stopwords and filtering out digits
    bow = TfidfVectorizer(min_df=2,stop_words=stops)
    X = bow.fit_transform(data)
    print 'Computing %s pairwise distances'%dist
    # KPCA transform bow vectors
    if dist == 'l2_kpca_zscore':
        K = pairwise_distances(X,metric='l2',n_jobs=1)
        perc = 50.0
        width = percentile(K.flatten(),perc)
        Xc = zscore(KernelPCA(n_components=50,kernel='rbf',gamma=width).fit_transform(X))
        K = pairwise_distances(Xc,metric='l2',n_jobs=1)
    elif dist == 'l2_kpca':
        K = pairwise_distances(X,metric='l2',n_jobs=1)
        perc = 100./len(data)
        width = percentile(K.flatten(),perc)
        Xc = KernelPCA(n_components=50,kernel='rbf',gamma=width).fit_transform(X)
        K = pairwise_distances(Xc,metric='l2',n_jobs=1)
    elif dist == 'l2':
        K = pairwise_distances(X,metric='l2',n_jobs=1)
    elif dist == 'l1':
        K = pairwise_distances(X,metric='l1',n_jobs=1)

    # collect closest neighbors
    distances = []
    for urlidx in range(len(data)):
        idx =  (K[urlidx,:]).argsort()[1:nneighbors+1]
        for sidx in idx:
            distances.append([urlidx,sidx,(idx==sidx).nonzero()[0][0]])

    return distances
Example #43
 def f_MedianPatchArea(self,cl=None,name="Median patch area",niv=50):
     res = []
     for group in self.groups:
         r = self.returnGroupArea(group,cl)
         try:
             v = scipy.percentile(r,niv)
         except ValueError: # Catch empty array
             v = "NULL"
         res.append( [group,name,v] )
     return res
Example #44
def kpca_cluster(data,nclusters=100,ncomponents=40,topwhat=10,zscored=False):
    '''

    Computes clustering of bag-of-words vectors of articles

    INPUT
    folder      model folder
    nclusters   number of clusters

    '''
    from sklearn.cluster import KMeans
    # filtering out some noise words
    stops = map(lambda x:x.lower().strip(),open('stopwords.txt').readlines()[6:])

    # vectorize non-stopwords 
    bow = TfidfVectorizer(min_df=2,stop_words=stops)
    X = bow.fit_transform(data)

    # creating bow-index-to-word map
    idx2word = dict(zip(bow.vocabulary_.values(),bow.vocabulary_.keys()))

    # using now stopwords and filtering out digits
    print 'Computing pairwise distances' 
    K = pairwise_distances(X,metric='l2',n_jobs=1)
    perc = 50.0
    width = percentile(K.flatten(),perc)

    # KPCA transform bow vectors
    Xc = KernelPCA(n_components=ncomponents,kernel='rbf',gamma=width).fit_transform(X)
    
    if zscored:
        Xc = zscore(Xc)
    
    # compute clusters
    km = KMeans(n_clusters=nclusters).fit(Xc)
    Xc = km.predict(Xc)

    clusters = []
    for icluster in range(nclusters):
        nmembers = (Xc==icluster).sum()
        if True:#nmembers < len(data) / 5.0 and nmembers > 1: # only group clusters big enough but not too big
            members = (Xc==icluster).nonzero()[0]
            topwordidx = array(X[members,:].sum(axis=0))[0].argsort()[-topwhat:][::-1]
            topwords = ' '.join([idx2word[wi] for wi in topwordidx])
            meanDist = triu(pairwise_distances(X[members,:],metric='l2',n_jobs=1)).sum()
            meanDist = meanDist / (len(members) + (len(members)**2 - len(members))/2.0)
            # print u'Cluster %d'%icluster + u' %d members'%nmembers + u' mean Distance %f'%meanDist + u'\n\t'+topwords
            clusters.append({
                'name':'Cluster-%d'%icluster,
                'description': topwords,
                'members': list(members),
                'meanL2Distances': meanDist
                })

    return clusters
Example #45
def __calc_mul_multiband_cut_threshold(area_id, datapath):
    prefix = area_id_to_prefix(area_id)
    band_values = {k: [] for k in range(8)}
    band_cut_th = {k: dict(max=0, min=0) for k in range(8)}

    image_id_list = pd.read_csv(FMT_VALTRAIN_IMAGELIST_PATH.format(
        prefix=prefix)).ImageId.tolist()
    for image_id in tqdm.tqdm(image_id_list[:500]):
        image_fn = get_train_image_path_from_imageid(
            image_id, datapath, mul=True)
        with rasterio.open(image_fn, 'r') as f:
            values = f.read().astype(np.float32)
            for i_chan in range(8):
                values_ = values[i_chan].ravel().tolist()
                values_ = np.array(
                    [v for v in values_ if v != 0]
                )  # Remove censored mask
                band_values[i_chan].append(values_)

    image_id_list = pd.read_csv(FMT_VALTEST_IMAGELIST_PATH.format(
        prefix=prefix)).ImageId.tolist()
    for image_id in tqdm.tqdm(image_id_list[:500]):
        image_fn = get_train_image_path_from_imageid(
            image_id, datapath, mul=True)
        with rasterio.open(image_fn, 'r') as f:
            values = f.read().astype(np.float32)
            for i_chan in range(8):
                values_ = values[i_chan].ravel().tolist()
                values_ = np.array(
                    [v for v in values_ if v != 0]
                )  # Remove censored mask
                band_values[i_chan].append(values_)

    logger.info("Calc percentile point ...")
    for i_chan in range(8):
        band_values[i_chan] = np.concatenate(
            band_values[i_chan]).ravel()
        band_cut_th[i_chan]['max'] = scipy.percentile(
            band_values[i_chan], 98)
        band_cut_th[i_chan]['min'] = scipy.percentile(
            band_values[i_chan], 2)
    return band_cut_th
Example #46
File: arias.py Project: rw/dragnet
    def __call__(self, blocks, train=False):
        from scipy import percentile

        features = np.zeros((len(blocks), AriasFeatures.nfeatures))

        block_lengths = np.array([len(block.text) for block in blocks])
        index = block_lengths.argmax()
        cutoff = int(percentile(block_lengths, self._percent_cutoff))
        lowindex, highindex = AriasFeatures.strip(block_lengths, index, self._window, cutoff)
        features[lowindex : (highindex + 1), 0] = 1.0
        return features
Example #47
def fit_dispersion(counts, disp_raw, disp_conv, sf, CFG):

    mean_count = sp.mean(counts / sf, axis=1)[:, sp.newaxis]
    index = sp.where(disp_conv)[0]

    lowerBound = sp.percentile(sp.unique(disp_raw[index]), 1)
    upperBound = sp.percentile(sp.unique(disp_raw[index]), 99)

    idx = sp.where((disp_raw > lowerBound) & (disp_raw < upperBound))[0]

    matrix = sp.ones((idx.shape[0], 2), dtype='float')
    matrix[:, 0] /= mean_count[idx].ravel()

    modGamma = sm.GLM(disp_raw[idx], matrix, family=sm.families.Gamma(sm.families.links.identity))
    res = modGamma.fit()
    Lambda = res.params

    disp_fitted = disp_raw.copy()
    ok_idx = sp.where(~sp.isnan(disp_fitted))[0]
    disp_fitted[ok_idx] = Lambda[0] / mean_count[ok_idx] + Lambda[1]

    if sp.sum(disp_fitted > 0) > 0:
        print "Found dispersion fit"

    if CFG['debug']:
        fig = plt.figure(figsize=(8, 6), dpi=100)
        ax = fig.add_subplot(111)
        idx = sp.where(~sp.isnan(disp_fitted))[0]
        ax.plot(sp.mean(sp.log10(counts + 1), axis=1)[idx], disp_fitted[idx], 'bo')
        ax.set_title('Fitted Dispersion Estimate')
        ax.set_xlabel('Mean expression count')
        ax.set_ylabel('Dispersion')
        plt.savefig('dispersion_fitted.pdf', format='pdf', bbox_inches='tight')
        plt.close(fig)

    return (disp_fitted, Lambda, idx)
Example #48
    def percentile(self, percentile):
        """Calculate a given spectral percentile for this `Spectrogram`.

        Parameters
        ----------
        percentile : `float`
            percentile (0 - 100) of the bins to compute

        Returns
        -------
        spectrum : `~gwpy.spectrum.Spectrum`
            the given percentile `Spectrum` calculated from this
            `SpectralVariance`
        """
        out = scipy.percentile(self.value, percentile, axis=0)
        name = '%s %s%% percentile' % (self.name, percentile)
        return Spectrum(out, epoch=self.epoch, channel=self.channel,
                        name=name, f0=self.f0, df=self.df,
                        frequencies=(hasattr(self, '_frequencies') and
                                     self.frequencies or None))
Example #49
def compute_profiles(data_grouped):
    profiles = {}
    r2_min = +1e16
    r2_max = -1e16

    for (index, resonance_id), profile in data_grouped.items():

        mag_ref = sp.mean(
            [data_pt.val for data_pt in profile if data_pt.par['ncyc'] == 0]
        )

        r2_profile = []

        for data_pt in profile:

            ncyc = data_pt.par['ncyc']
            time_t2 = data_pt.par['time_t2']

            frq = ncyc / time_t2

            if frq:
                mag_cal = data_pt.cal
                mag_exp = data_pt.val
                mag_err = data_pt.err
                mag_ens = sp.random.normal(mag_exp, mag_err, 10000)

                r2_cal = -sp.log(mag_cal / mag_ref) / time_t2
                r2_exp = -sp.log(mag_exp / mag_ref) / time_t2
                r2_ens = -sp.log(mag_ens / mag_ref) / time_t2
                r2_err = abs(sp.percentile(r2_ens, [15.9, 84.1]) - r2_exp)
                r2_erd, r2_eru = r2_err

                r2_profile.append([frq, r2_cal, r2_exp, r2_erd, r2_eru])

                r2_min = min(r2_min, r2_cal, r2_exp - r2_erd)
                r2_max = max(r2_max, r2_cal, r2_exp + r2_eru)

        r2_profile = zip(*sorted(r2_profile))
        profiles.setdefault((index, resonance_id), []).append(r2_profile)

    return profiles, r2_min, r2_max
Example #50
    def percentile(self, percentile):
        """Calculate a given spectral percentile for this `Spectrogram`.

        Parameters
        ----------
        percentile : `float`
            percentile (0 - 100) of the bins to compute

        Returns
        -------
        spectrum : `~gwpy.frequencyseries.FrequencySeries`
            the given percentile `FrequencySeries` calculated from this
            `SpectralVariance`
        """
        out = scipy.percentile(self.value, percentile, axis=0)
        if self.name is not None:
            name = '{}: {} percentile'.format(self.name, _ordinal(percentile))
        else:
            name = None
        return FrequencySeries(out, epoch=self.epoch, channel=self.channel,
                               name=name, f0=self.f0, df=self.df,
                               frequencies=(self.frequencies
                                            if hasattr(self, '_frequencies')
                                            else None))
Example #51
def summarize_sampler(sampler, burn=0, thin=1, ci=0.95):
    r"""Create summary statistics of the flattened chain of the sampler.
    
    The confidence regions are computed from the quantiles of the data.
    
    Parameters
    ----------
    sampler : :py:class:`emcee.EnsembleSampler` instance
        The sampler to summarize the chains of.
    burn : int, optional
        The number of samples to burn from the beginning of the chain. Default
        is 0 (no burn).
    thin : int, optional
        The step size to thin with. Default is 1 (no thinning).
    ci : float, optional
        A number between 0 and 1 indicating the confidence region to compute.
        Default is 0.95 (return upper and lower bounds of the 95% confidence
        interval).
    
    Returns
    -------
    mean : array, (num_params,)
        Mean values of each of the parameters sampled.
    ci_l : array, (num_params,)
        Lower bounds of the `ci*100%` confidence intervals.
    ci_u : array, (num_params,)
        Upper bounds of the `ci*100%` confidence intervals.
    """
    # Discard burn-in and thin, then flatten walkers and steps together.
    flat_trace = sampler.chain[:, burn::thin, :]
    flat_trace = flat_trace.reshape((-1, flat_trace.shape[2]))

    mean = scipy.mean(flat_trace, axis=0)
    # Probability mass (in percent) left in each tail outside the CI.
    cibdry = 100.0 * (1.0 - ci) / 2.0
    ci_l, ci_u = scipy.percentile(flat_trace, [cibdry, 100.0 - cibdry], axis=0)
    
    return (mean, ci_l, ci_u)
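As the docstring says, the bounds are plain quantiles of the flattened chain: a ci of 0.95 leaves 2.5% in each tail. A self-contained sketch with a fake chain in emcee's (n_walkers, n_steps, n_params) layout, all numbers illustrative:

import numpy as np

chain = np.random.randn(32, 5000, 3)
burn, thin, ci = 1000, 10, 0.95

flat = chain[:, burn::thin, :].reshape(-1, chain.shape[2])
tail = 100.0 * (1.0 - ci) / 2.0  # 2.5 for a 95% interval
mean = flat.mean(axis=0)
ci_l, ci_u = np.percentile(flat, [tail, 100.0 - tail], axis=0)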
Example #52
def calc_noise(wav_filename):
    window_buffers, sample_rate = stft.get_buffers_from_file(wav_filename)

    # One column of FFT amplitudes per analysis window.
    bins = numpy.empty([
        stft.WINDOWSIZE // 2 + 1,
        len(window_buffers),
        ])
    for i, window_buffer in enumerate(window_buffers):
        fft_amplitude = stft.stft_amplitude(window_buffer)
        bins[:, i] = fft_amplitude
    print(bins.shape)

    freqs = [ stft.bin2hertz(i, sample_rate)
        for i in range(stft.WINDOWSIZE // 2 + 1) ]
    #for i, window_buffer in enumerate(window_buffers):
    #    pylab.plot(freqs,
    #        stft.amplitude2db(bins[:,i]),
    #        color="blue")

    noise = numpy.empty(len(bins[:,0]))
    means = numpy.empty(len(bins[:,0]))
    mins = numpy.empty(len(bins[:,0]))
    stds = numpy.empty(len(bins[:,0]))
    for i, bin_spot in enumerate(bins):
        detected_noise = scipy.percentile(bin_spot,
            defs.NOISE_PERCENTILE_BELOW)
        #noise[i] = stft.db2amplitude(stft.amplitude2db(detected_noise))
        noise[i] = detected_noise
        means[i] = scipy.mean(bin_spot)
        mins[i] = bin_spot.min()
        stds[i] = scipy.std(bin_spot, ddof=1)
        #if i == 100:
        #   numpy.savetxt("noise.csv", bin_spot, delimiter=', ')

    #return noise, freqs, variance
    return noise, freqs, means, mins, stds
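The idea behind defs.NOISE_PERCENTILE_BELOW is that, per frequency bin, a low percentile over time ignores loud-but-rare signal frames and tracks the noise floor. A sketch of just that step, with illustrative shapes and percentile value:

import numpy as np

# Hypothetical STFT magnitudes: (n_bins, n_frames) of noise plus sparse tones.
rng = np.random.RandomState(0)
bins = np.abs(rng.normal(0.0, 1.0, (513, 200)))
bins[100, ::7] += 50.0  # a tone that sounds only in some frames

# Low percentile per bin: robust to the intermittent tone, follows the noise.
noise_floor = np.percentile(bins, 10, axis=1)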
Example #53
import numpy as np
import scipy
import scipy.optimize as op
import scipy.stats as ss


def inv_logit(x):
    # Logistic function: turns log-odds into responsibilities in (0, 1).
    return 1.0 / (1.0 + scipy.exp(-x))


def f_sum(xx, data, Z, lam):
    # M-step objective (an assumed helper, not shown in this snippet):
    # negative expected complete-data log-likelihood of the two components.
    a1 = scipy.log(ss.norm.pdf(data, loc=xx[0], scale=xx[1]) * lam)
    a0 = scipy.log(ss.norm.pdf(data, loc=xx[2], scale=xx[3]) * (1 - lam))
    return -np.sum(Z * a1 + (1 - Z) * a0)


def E_step(x1, x0, lam, data):
    # E-step: log-odds that each point belongs to component 1.
    a1 = scipy.log(ss.norm.pdf(data, loc=x1[0], scale=x1[1]) * lam)
    a0 = scipy.log(ss.norm.pdf(data, loc=x0[0], scale=x0[1]) * (1 - lam))
    lratio = a1 - a0
    return inv_logit(lratio)

eps=1e-4
data1=np.array(ss.norm.rvs(loc=20,scale=5,size=1000))
data0=np.array(ss.norm.rvs(loc=0,scale=5,size=300))
data=np.concatenate([data1,data0])
x1_old=np.array([6,1])
x0_old=np.array([-3,2])
xx_old=np.concatenate((x1_old,x0_old))
Z=np.ones(len(data))*0.5
Z[data>scipy.percentile(data,90)]=1
Z[data<scipy.percentile(data,10)]=0
#Z[1:10]=1
#Z[-10:-1]=0
lam=0.5
ans=np.zeros(len(data))
ans[0:1000] = 1  # data1 contributed the first 1000 points
diff=1
cnt=0
while diff > eps:
    lam = sum(Z) / len(data)
    res = op.minimize(f_sum, xx_old, args=(data, Z, lam),
                      method='Nelder-Mead',
                      options={'xtol': 1e-8, 'disp': False})
    xx = res.x
    diff = max(abs(xx - xx_old))
    x1 = xx[0:2]
    x0 = xx[2:]
    # E-step: refresh the responsibilities with the new parameters and
    # carry the estimates into the next iteration.
    Z = E_step(x1, x0, lam, data)
    xx_old = xx
    cnt += 1
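After convergence one would expect x1 near (20, 5), x0 near (0, 5) and lam near 1000/1300. A quick check against the ground-truth labels, as a sketch using the names defined above:

print('lam = %.3f, x1 = %s, x0 = %s' % (lam, x1, x0))
accuracy = np.mean((E_step(x1, x0, lam, data) > 0.5) == ans)
print('label accuracy = %.3f' % accuracy)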
Example #54
def plotBias(vals, fn_plot, myidx, logScale=False, refname='TCGA'):

    # Tukey-style fences: 1.5 * IQR, computed separately for the
    # reference cohort (~myidx) and the user's samples (myidx).
    iqr = (sp.percentile(vals[~myidx], 75) - sp.percentile(vals[~myidx], 25)) * 1.5
    iqr2 = (sp.percentile(vals[myidx], 75) - sp.percentile(vals[myidx], 25)) * 1.5

    sidx   = sp.argsort(vals)
    vals   = vals[sidx]
    myidx = myidx[sidx]

    fig  = plt.figure(figsize=(12,10))
    ax   = fig.add_subplot(111)
    ax_c = ax.twinx()
    ax.vlines(sp.arange(vals.shape[0])[myidx], [0], vals[myidx], label='%s Reference' % refname)
    ax.vlines(sp.arange(vals.shape[0])[~myidx], [0], vals[~myidx], color='r', label='Your Samples')

    ax.plot([0,vals.shape[0]],[3,3], '--', color = 'green')
    ax.plot([0,vals.shape[0]],[5,5] , '--',color = 'green')
    ax.plot([0,vals.shape[0]],[iqr + sp.percentile(vals[~myidx], 75),iqr + sp.percentile(vals[~myidx], 75)], '--',color = 'green')
    ax.plot([0,vals.shape[0]],[iqr2 + sp.percentile(vals[myidx], 75),iqr2 + sp.percentile(vals[myidx], 75)], '--',color = 'green')

#    ax.plot([0,vals.shape[0]],[6.25,6.25],'--', color = 'green')
    ax.plot([0,vals.shape[0]],[10,10] , '--',color = 'green')
    ax.set_ylabel('Median 3\'/5\' Bias')
    ax.set_xlim(0,vals.shape[0])
    if logScale:
        ax.set_yscale('log')
        ax_c.set_yscale('log')
    ax_c.set_ylim(ax.get_ylim())

    ### add right side ticks (the thresholds are identical on both scales,
    ### since the twin axis shares the limits and log scaling of the left one)
    tick_thresholds = sp.array([3, 5,
                                iqr + sp.percentile(vals[~myidx], 75),
                                iqr2 + sp.percentile(vals[myidx], 75), 10])
    tick_idx        = sp.argsort(tick_thresholds)
    tick_thresholds = tick_thresholds[tick_idx]
    tick_thresholds = sp.around(tick_thresholds, decimals = 2)
    ax_c.set_yticks(tick_thresholds)

    # Truncate labels to four characters, then widen the dtype so the
    # annotation suffixes fit (unicode dtype keeps this Python 3 safe).
    tick_thresholds = tick_thresholds.astype('U4')
    tick_thresholds = tick_thresholds.astype('U50')
    tick_thresholds[tick_idx == 2] = tick_thresholds[tick_idx == 2][0] + ' (Your Filter)'
    tick_thresholds[tick_idx == 3] = tick_thresholds[tick_idx == 3][0] + ' (%s Filter)' % refname

    ax_c.set_yticklabels(tick_thresholds)


    ax.grid()
    ax.legend(loc=2)
    plt.tight_layout()
    plt.savefig(fn_plot, dpi = 300)
    plt.clf()
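The dashed green lines implement Tukey's rule: a sample is flagged when its bias exceeds Q3 + 1.5 * IQR of its cohort. The rule in isolation, on illustrative data:

import numpy as np

vals = np.random.lognormal(1.0, 0.4, size=500)  # hypothetical bias values
q1, q3 = np.percentile(vals, [25, 75])
upper_fence = q3 + 1.5 * (q3 - q1)  # Tukey's fence, as drawn above
outliers = vals > upper_fence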
Example #55
import json

import numpy as np
import pandas as pd
from scipy import percentile  # numpy's percentile, re-exported by older scipy

np.random.seed(10)  # seed numpy's RNG; stdlib random.seed would not affect randn

df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])

out = []
for row in df.T.itertuples(index=False):
    out.append({'max': max(row), 'min': min(row), 
                'Q1': percentile(row, 25), 
                'median': percentile(row, 50), 
                'Q3': percentile(row, 75)})
print(out)

with open("../quartiles/quartiles.json", "w") as f:
    json.dump(out, f)
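For comparison, pandas can produce the same five-number summary directly, one row per column of df; describe() uses the same linear interpolation as percentile:

summary = df.describe().loc[['min', '25%', '50%', '75%', 'max']]
print(summary.T)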