Example #1
def clean_outliers(points):
    boundaries_array = np.array(points)
    # slope, intercept, r_value, p_value, std_err
    print(boundaries_array)
    slope, intercept, _, _, _ = stats.linregress(boundaries_array)
    boundaries_array = remove_outliers(boundaries_array, slope, intercept)
    slope, intercept, _, _, _ = stats.linregress(boundaries_array)
    return remove_outliers(boundaries_array, slope, intercept)
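Note: `remove_outliers` is not shown in this example. A minimal sketch of what such a helper could look like, assuming `points` is an (N, 2) array of (x, y) pairs and a 2-sigma residual cutoff (both assumptions, not from the original project):

import numpy as np

def remove_outliers(points, slope, intercept, n_sigma=2.0):
    # Hypothetical helper: drop points whose vertical residual from the fitted
    # line y = slope * x + intercept exceeds n_sigma residual standard deviations.
    points = np.asarray(points, dtype=float)
    residuals = points[:, 1] - (slope * points[:, 0] + intercept)
    keep = np.abs(residuals) <= n_sigma * residuals.std()
    return points[keep]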
Example #2
def slop(bin,binwidth):
    outputlist = [["Bin", "\t", "Frequency", "\t", "Slope1", "\t", "Slope2", "\t", "peak-Width", "\t", "peak-Apex", "\t","intercept_mass", "\n"]]
    slope1 = [0]
    for index in range(0, len(bin) - 6):
        tempD=dict(itertools.islice(bin.items(), index,index + 7))
        s, intercept, r, p, std_error = linregress(list(tempD.keys()), list(tempD.values()))
        slope1.append(s)
    slope2 = []
    for index1 in range(0, len(bin) - 13):
        tempD=dict(itertools.islice(bin.items(), index1 + 3,index1 + 10))
        #print(index1,len(tempD),len(slope1[index1 + 1:index1 + 8]))
        s1, intercept1, r1, p1, std_error1 = linregress(list(tempD.values()), slope1[index1 + 1:index1 + 8])
        slope2.append(s1)
    apex = []
    peak = []
    interceptList = [0]
    if len(bin) % 2 == 0:
        minus1 = 6
        minus2 = 3
    else:
        minus1 = 7
        minus2 = 3
    for index3 in range(len(bin) - minus1):
        if slope1[index3] > 0.0 and slope1[index3 + 1] < 0.0:
            apex.append("1")
        else:
            apex.append("0")
    for index4 in range(len(bin) - 13):
        if slope2[index4] < 0:
            peak.append("1")
        else:
            peak.append("0")
    slope1 = [0] * 2 + slope1 + [0] * 3
    slope2 = [0] * 6 + slope2 + [0] * 6
    apex = [0] * 3 + apex + [0] * (len(bin) - (len(apex) + 3))
    peak = [0] * 6 + peak + [0] * 6
    for index6 in range(len(bin) - 6):
        if (abs(slope1[index6 + 1]) + abs(slope1[index6 + 2])) == 0.0:
            intercept_mass = float("inf")
            interceptList.append(intercept_mass)
        else:
            tempD = dict(itertools.islice(bin.items(), index6, index6 + 1))
            intercept_mass = list(tempD.keys())[0] + (float(binwidth) * abs(slope1[index6 + 1])) / (
                abs(slope1[index6 + 1]) + abs(slope1[index6 + 2]))
            interceptList.append(intercept_mass)
    interceptList = interceptList + [0] * (len(bin) - len(interceptList))
    plot_x = []
    plot_y = []
    for index5 in range(len(bin)-13):
        tempD = dict(itertools.islice(bin.items(), index5, index5 + 1))
        outputlist.append([str(list(tempD.keys())[0]), "\t", str(list(tempD.values())[0]), "\t", str(slope1[index5]), "\t", str(slope2[index5]), "\t", str(peak[index5]), "\t", str(apex[index5]), "\t", str(interceptList[index5]), "\n"])
    return outputlist
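The core operation above is a 7-bin rolling linregress over an ordered histogram dict (bin -> frequency). A self-contained sketch of just that step, with toy data (the histogram values below are made up for illustration):

import itertools
from scipy.stats import linregress

# Toy histogram: bin centre -> frequency (dict insertion order matters for islice).
hist = {100 + i: f for i, f in enumerate([1, 3, 7, 12, 18, 12, 7, 3, 1, 0, 0])}

rolling_slopes = []
for start in range(len(hist) - 6):
    window = dict(itertools.islice(hist.items(), start, start + 7))
    slope, intercept, r, p, stderr = linregress(list(window.keys()), list(window.values()))
    rolling_slopes.append(slope)
print(rolling_slopes)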
Example #3
    def test_multinomial_elementwise_distribution(self):
        '''Verify that the created variables approach a multinomial distribution for large numbers
        of samples.'''
        (m, n, k) = (6, 5, 1)
        r = 2 ** np.arange(4, 17)
        p = statutil.random_row_stochastic((m, n))
        #p = statutil.scale_row_sums(np.ones((m, n)))
        error = np.zeros((len(r),))
        for (i, r_val) in enumerate(r):
            for _ in xrange(k):
                x = statutil.multinomial_elementwise(p, r_val)
                # Root-mean-square-error of observed frequencies w.r.t. desired frequencies
                error[i] += statutil.norm_frobenius_scaled(statutil.hist(x, n) / (1.0 * r_val) - p)
            error[i] /= (1.0 * k)
        # Validate the model error of the central limit theorem: C*r^(-0.5).
        # This is a consequence of the Central Limit Theorem. We are making k experiments for
        # each value of n. Even if k=1, there's a 95% chance that we are within ~1.96 standard deviations
        # from the mean of the normal distribution sqrt(n)*[observed freq variable - p[i,j]] for each
        # entry j of a row i of the matrix p. So if row i's stddev is s[i], the sum of squared errors
        # should be (with 95% confidence) <= n * (1.96*s[i])^2. So
        # C <= sqrt(sum(n * (1.96*s[i])^2)_i / (m*n)) = 1.96 * sqrt(sum(s[i]^2)_i / m).
        # See http://en.wikipedia.org/wiki/Central_limit_theorem
        alpha, c, r_value, _, _ = linregress(np.log(r), np.log(error))
        c = np.exp(c)
#        print c , 1.96 * np.linalg.linalg.norm(np.sum(p * np.arange(p.shape[1]) ** 2, axis=1) - 
#                                                          np.sum(p * np.arange(p.shape[1]), axis=1) ** 2,
#                                                          2) / np.sqrt(p.shape[0]),
        assert_almost_equal(alpha, -0.5, decimal=1, err_msg='Unexpected error term growth power')
        self.assertTrue(c <= 1.96 * np.linalg.linalg.norm(np.sum(p * np.arange(p.shape[1]) ** 2, axis=1) - 
                                                          np.sum(p * np.arange(p.shape[1]), axis=1) ** 2,
                                                          2) / np.sqrt(p.shape[0]),
                        'Error term coefficient outside 95% confidence interval')
        self.assertTrue(abs(r_value) > 0.99, 'Error does not fit a power law in sample size')
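The test hinges on fitting error ≈ C * r^alpha by regressing log(error) on log(r). The same trick in isolation, with synthetic data (the constants below are arbitrary, not from the original test):

import numpy as np
from scipy.stats import linregress

rng = np.random.default_rng(0)
r = 2.0 ** np.arange(4, 17)
# Synthetic power law error = C * r**alpha with a little multiplicative noise.
C_true, alpha_true = 0.8, -0.5
error = C_true * r ** alpha_true * np.exp(rng.normal(scale=0.02, size=r.size))

# A power law is a straight line in log-log space: log(error) = alpha*log(r) + log(C).
alpha, log_c, r_value, _, _ = linregress(np.log(r), np.log(error))
print(alpha, np.exp(log_c), r_value ** 2)  # roughly -0.5, 0.8, and close to 1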
Example #4
def calculate_monthly_lapse_rates(csv, station_meta):
    mdf = read_csv(csv,
                   sep=' ',
                   infer_datetime_format=True,
                   index_col=0,
                   parse_dates=True)
    mdf = mdf.groupby(mdf.index.month).mean()
    with open(station_meta, 'r') as js:
        stations = json.load(js)

    tmin_lapse, tmax_lapse = [], []
    for temp in ['tmin', 'tmax']:
        for month in range(1, 13):
            temps, elevations = [], []
            cols = [c for c in mdf.columns if temp in c]
            d = mdf[cols]
            for s in stations.keys():
                temps.append(d['{}_{}'.format(s, temp)].loc[month])
            for v in stations.values():
                elevations.append(v['elev'])
            regression = linregress(elevations, temps)
            if temp == 'tmin':
                tmin_lapse.append('{:.3f}'.format(regression.slope * 1000.))
            else:
                tmax_lapse.append('{:.3f}'.format(regression.slope * 1000.))

    print('tmax_lapse = {}'.format(', '.join(tmax_lapse)))
    print('tmin_lapse = {}'.format(', '.join(tmin_lapse)))

    print('station elevations')
    elevs = sorted([(v['zone'], v['elev']) for k, v in stations.items()],
                   key=lambda x: x[0])
    print(', '.join([str(x[1]) for x in elevs]))
Example #5
def reuse_model_reg(X_test, y_test, wildcard_name, ws=os.getcwd(), save=True):

    misc_output_path = os.path.join(os.getcwd(), 'output_rs_learn', 'misc')

    if not os.path.exists(misc_output_path):
        os.makedirs(misc_output_path)

    prediction_list = []
    feature_list = []
    for tuned_model in glob.glob(
            os.path.join(ws, 'output_rs_learn', 'tuned_models',
                         f'{wildcard_name}*.sav')):

        model_trained = joblib.load(tuned_model)
        model_name = os.path.basename(tuned_model)[:-4]
        prediction = model_trained.predict(X_test)

        prediction_list.append(prediction)
        feature_list.append(model_name)

        slope, intercept, r_value, p_value, std_err = stats.linregress(
            y_test, prediction)
        r2 = r2_score(y_test, prediction)
        rmse = sqrt(mean_squared_error(prediction, y_test))
        percent_err = ((prediction - y_test) / y_test) * 100
        mnb = np.mean(percent_err)

        print(f'{model_name} r: %.2f, r2: %.2f, rmse: %.2f, mnb: %.2f' %
              (r_value, r2, rmse, mnb))

    df_prediction = pd.DataFrame(prediction_list).T
    df_prediction.columns = feature_list

    return df_prediction
Example #6
def rest_task_regression():
    for tpt in [tpt_cole, tpt_sh]:
        fig, axs = plt.subplots(2, 3, figsize=(16, 10), sharex="row", sharey="row")
        txt = None
        for li, (lib, name, lbl) in enumerate(lib_details):
            df = lib.gen_long_data(tpt) \
                .groupby(["task", "region", "network"]).mean().reset_index() \
                .convert_column(metric=lambda x: x * 1000)
            df_rest = df.and_filter(task="Rest")
            txt = []
            for ti, task in enumerate(task_order(False)):
                dft = pd.merge(df_rest, df.and_filter(task=task), on=["region", "network"])
                ax = axs[li, ti]
                sns.scatterplot(data=dft, x="metric_x", y=f"metric_y", hue="network", hue_order=tpt.net_order,
                                ax=ax, palette=tpt.net_colors)
                slope, intercept, r_value, _, _ = stats.linregress(dft.metric_x, dft.metric_y)
                sns.lineplot(dft.metric_x, slope * dft.metric_x + intercept, ax=ax, color='black')
                ax.text(0.3, 0.8, f"$r^2$={r_value ** 2:.2f}***", ha='center', va='center', transform=ax.transAxes)
                ax.set(xlabel=f"Rest {lbl}", ylabel="")
                ax.get_legend().remove()
                txt.append(ax.text(-0.15 if ti == 0 else -0.05, 0.5, f"{task} {lbl}",
                                   transform=ax.transAxes, rotation=90, va='center', ha='center'))
        legend_handles = []
        for net, color, label in zip(tpt.net_order, tpt.net_colors, tpt.net_labels(break_space=False)):
            legend_handles.append(Line2D([], [], color=color, marker='o', linestyle='None', markersize=5, label=label))
        n_col = 6 if len(tpt.net_order) == 12 else 7
        lgn = fig.legend(handles=legend_handles, loc=2, ncol=n_col, handletextpad=0.1, mode="expand",
                         bbox_to_anchor=(0.12, -0.04, 0.785, 1))
        print(savefig(fig, f"regression.{tpt}", extra_artists=txt + [lgn, ], low=False))
Example #7
def calculate_histogram_sizes(tracks_queue, config, out_queue):
    params = config['tracking']['process']
    df = DataFrame()
    sleep(5)
    while True:
        while not tracks_queue.empty() or tracks_queue.qsize() > 0:
            data = tracks_queue.get()
            df = df.append(data)

        if len(df) % 100 == 0:
            # t1 = tp.filter_stubs(df, params['min_traj_length'])
            # print(t1.head())
            # t2 = t1[((t1['mass'] > params['min_mass']) & (t1['size'] < params['max_size']) &
            #          (t1['ecc'] < params['max_ecc']))]
            # print(t2.head())
            # t2 = t1
            # d = tp.compute_drift(t1)
            # tm = tp.subtract_drift(t2.copy(), d)
            im = tp.imsd(df, config['tracking']['process']['um_pixel'], config['camera']['fps'])
            values = []
            for pcle in im:
                data = im[pcle]
                slope, intercept, r, p, stderr = stats.linregress(np.log(data.index), np.log(data.values))
                values.append([slope, intercept])

            out_queue.put(values)
Example #8
def fit_exponential_func(y, x):
	## Fit with y = Ae^(Bx) -> logy = logA + Bx
	# Returns A and B of a function as: y = A*e^(Bx)

	B, logA, r_value, p_value, std_err = linregress(np.transpose(x.values), np.log(y))
	
	return np.exp(logA), B  
Example #9
def find_consensus(unassigned, sample, is_vertical):
    """Attempt to find a set of measurements that forms a consensus, in the list of measurements.
    
    Args:
        unassigned (list): List of unassigned measurements.
        sample (list): Measurements that fit the extracted line.
        is_vertical (bool): Whether the landmark is close to being vertical.
        
    Returns:
        list: List of measurements that fit the line.
    """
    cartesian_sample = numpy.array([point.location for point in sample])
    cartesian_unassigned = numpy.array(
        [point.location for point in unassigned])
    consensus = []

    # If almost vertical, calculate line in terms of y.
    if is_vertical:
        cartesian_sample = numpy.fliplr(cartesian_sample)
        cartesian_unassigned = numpy.fliplr(cartesian_unassigned)

    # Calculate regression line.
    slope, intercept, r_, p_, e_ = stats.linregress(cartesian_sample[:, 0],
                                                    cartesian_sample[:, 1])

    # Find the unassigned points that match to this line.
    for i in range(len(unassigned)):
        # If the point lies close enough to the line.
        if util.point_line_dist(cartesian_unassigned[i], slope,
                                intercept) < RANSAC_TOLERANCE:
            consensus.append(unassigned[i])  # Add it to the consensus points.

    return consensus
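`util.point_line_dist` and `RANSAC_TOLERANCE` come from the surrounding project and are not shown. A plausible sketch of the distance helper, assuming `point` is an (x, y) pair and the line is y = slope*x + intercept:

import numpy as np

def point_line_dist(point, slope, intercept):
    # Hypothetical helper: perpendicular distance from (x, y) to y = slope*x + intercept.
    x, y = point
    return abs(slope * x - y + intercept) / np.sqrt(slope ** 2 + 1.0)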
Example #10
def plot_mean_boxplot_with_pearson(dataset_id):
    data = []
    pearson = []
    for i, technique_id in enumerate(technique_list):
        print(Globals.acronyms[technique_id], end=' ', flush=True)
        technique_pearson = []
        technique_data = []
        history = Parser.parse_rectangles(technique_id, dataset_id)
        for revision in range(len(history) - 1):
            delta_vis = DeltaMetrics.compute_delta_vis(history[revision],
                                                       history[revision + 1])
            delta_data = DeltaMetrics.compute_delta_data(
                history[revision], history[revision + 1])
            un_mov = UnavoidableMovement.compute_unavoidable_movement(
                history[revision], history[revision + 1])

            ratios = (1 - delta_vis) / (1 - delta_data)
            diffs = 1 - abs(delta_vis - delta_data)
            unavoidable = 1 - (delta_vis - un_mov)
            mean = (ratios + diffs + unavoidable) / 3
            technique_data.append(mean)

            # Compute linear regression statistics
            _, _, r_value, _, _ = stats.linregress(delta_data, delta_vis)
            technique_pearson.append(r_value if r_value > 0 else 0)

        data.append(technique_data)
        pearson.append(technique_pearson)

    TimeBoxplot.plot_with_pearson(data,
                                  technique_list,
                                  pearson,
                                  title='Mean with Pearson - ' + dataset_id)
Example #11
File: analysis.py Project: vsoch/alleninf
def approximate_random_effects(data, labels, group):

    correlation_per_donor = {}
    for donor_id in set(data[group]):
        correlation_per_donor[donor_id], _, _, _, _ = linregress(list(data[labels[0]][data[group] == donor_id]),list(data[labels[1]][data[group] == donor_id]))
    average_slope = np.array(correlation_per_donor.values()).mean()
    t, p_val = ttest_1samp(correlation_per_donor.values(), 0)
    print "Averaged slope across donors = %g (t=%g, p=%g)"%(average_slope, t, p_val)    
    return average_slope, t, p_val
Example #12
File: dot.py Project: wpwupingwp/python
def draw_fit(x, y):
    range = arg.regression
    x = x[:range]
    y = y[:range]
    slope, intercept, r_value, *_ = linregress(x, y)
    text = r'$f(x)={0:.3f}x+{1:.3f}, R^2={2:.3f}$'.format(
        slope, intercept, r_value**2)
    fit = [slope*i+intercept for i in x]
    plt.plot(x[:arg.regression], fit, 'k--')
    plt.annotate(text, xy=(x[-1], y[-2]))
Example #13
def regress(my_dict):
    count = 0
    x = []
    y = []
    for k, v in my_dict.items():
        x.append(count)
        count += 1
        y.append(v)
    m, b, r, p, std_err = linregress(x, y)
    print("b = " + str(b) + ", m = " + str(m) + ", r^2 = " + str(r * r))
Example #14
def scatter_plot(ssu_df, fg_df):
    ssu_iden, fg_iden, fg_siml = ssu_df['identity(%)'], fg_df['identity(%)'], fg_df['similarity(%)']
    fig = plt.figure(figsize=(15,7),dpi=300)
    gs = gridspec.GridSpec(1,2,wspace=0.2,left=0.05, right=0.95)
#   correlation plot of 16S rRNA identity versus functional gene identity
    ax0 = plt.subplot(gs[0])
    plt.scatter(ssu_iden,fg_iden,color='blue',s=1)
    iden_func = stats.linregress(ssu_iden,fg_iden)
    x_rg = range(int(min(ssu_iden)),int(max(ssu_iden))+1)
    y_rg = np.polyval([iden_func[0],iden_func[1]],x_rg)
    plt.text(5,95, r'$y =  %.2f x  %s $' % (iden_func[0],intercept(iden_func[1])), fontsize=15)
    plt.text(5,90, r'$R^2=%.4f$' % (iden_func[2]**2))
    plt.text(5,85, r'$P-value=%.2e$' % (iden_func[3]))
    plt.text(5,80, r'$StdErr=%.4f$' % (iden_func[4]))
    plt.title('16S rRNA identity vs. Functional gene identity')
    plt.plot(x_rg,y_rg,'r--',label='line 1')   
    plt.xlabel('16S rRNA gene identity (%)')
    plt.ylabel('Functional gene identity (%)')
    plt.ylim(0,100)
    plt.xlim(0,100)
#   correlation plot of 16S rRNA identity versus functional gene similarity
    ax1 = plt.subplot(gs[1])
    plt.scatter(ssu_iden,fg_siml,color='green',s=1)
    siml_func = stats.linregress(ssu_iden,fg_siml)
    x_rg = range(int(min(ssu_iden)),int(max(ssu_iden))+1)
    y_rg = np.polyval([siml_func[0],siml_func[1]],x_rg)
    plt.text(5,95, r'$y =  %.2f x  %s $' % (siml_func[0],intercept(siml_func[1])), fontsize=15)
    plt.text(5,90, r'$R^2=%.4f$' % (siml_func[2]**2))
    plt.text(5,85, r'$P-value=%.2e$' % (siml_func[3]))
    plt.text(5,80, r'$StdErr=%.4f$' % (siml_func[4]))
    plt.title('16S rRNA identity vs. Functional gene similarity')
    (m,b) = np.polyfit(ssu_iden,fg_siml, 1)
    x_rg = range(int(min(ssu_iden)),int(max(ssu_iden))+1)
    y_rg = np.polyval([m,b],x_rg)
    plt.plot(x_rg,y_rg,'r--')
    plt.xlabel('16S rRNA gene identity (%)')
    plt.ylabel('Functional gene similarity (%)')
    plt.ylim(0,100)
    plt.xlim(0,100)
    plt.savefig(o_dir+'/correlation_plot.pdf')
    return iden_func, siml_func
Example #15
 def _linear_regression(self):
     """
     The final trend is expressed as a linear regression of the trend signal obtained after deseasonalizing
     the input signal.
     :return:
     """
     line = np.asarray(self.d).copy()
     # line = filter_outlier(np.asarray(temporal_series).copy(), nsigma=1)
     xx = np.arange(0, len(line), 1)
     slope, intercept, r_value, p_value, std_err = stats.linregress(
         xx[~np.isnan(line)], line[~np.isnan(line)])
     return slope, intercept, p_value, np.square(r_value), std_err
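The same NaN-aware trend fit, pulled out of the class as a stand-alone sketch (the function name and plain-array input are illustrative assumptions):

import numpy as np
from scipy import stats

def linear_trend(signal):
    # Regress the signal against its sample index, ignoring NaN samples.
    line = np.asarray(signal, dtype=float)
    xx = np.arange(len(line))
    mask = ~np.isnan(line)
    slope, intercept, r_value, p_value, std_err = stats.linregress(xx[mask], line[mask])
    return slope, intercept, p_value, np.square(r_value), std_err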
Example #16
def linregress(x_vals, y_vals):
    '''
    least-squares regression of scipy
    '''
    a_value, b_value, r_value, p_value, std_err = stats.linregress(x_vals,y_vals)
    est_yvals = a_value * pylab.array(x_vals) + b_value
    k = 1 / a_value
    print p_value, std_err
    # plot regression line
    pylab.plot(x_vals, est_yvals, label='Least-squares fit, k = ' + str(round(k)) +
               ", RSquare = " + str(r_value**2))
    pylab.legend(loc='best')
Example #17
def plot_regression(df, x, y, extra_names={}):
    '''Plot a regression with annotated statistics.'''
    # ugly hack to include origin in plot bounds
    plt.clf()
    ax = _do_plot(df, x, y)
    xlim, ylim = ax.get_xlim(), ax.get_ylim()
    ax.cla()
    ax.set_xlim(*xlim)
    ax.set_ylim(*ylim)
    _do_plot(df, x, y, ax=ax)

    # calculate some regression statistics...
    info = [
        ("{} = " + ("{}" if isinstance(v, int) else "{:.2f}")).format(k, v)
        for k, v in it.chain(
            zip(
                [
                    'Slope',
                    'Intercept',
                    '$R^2$',
                    '$p$',
                    'Standard Error',
                ],
                stats.linregress(df[x], df[y]),
            ), [('$n$', len(df))])
    ]

    # ... and annotate regression statistics onto upper left
    at = AnchoredText(
        '\n'.join(info),
        frameon=True,
        loc='upper left',
    )
    ax.add_artist(at)

    # save to file
    # and assert df['Load'] is homogeneous
    plt.savefig(
        kn.pack({
            **{
                'x': slugify(x),
                'y': slugify(y),
                'synchronous': str(synchronous),
                'ext': '.png',
            },
            **extra_names
        }),
        transparent=True,
        dpi=300,
    )
Example #18
def approximate_random_effects(data, labels, group):
    slope_per_donor = np.array([])
    rval_per_donor = np.array([])
    #print "Performing approximate random effect analysis..."
    for donor_id in set(
            data[group]):  #for donor_id in donorids, perform linear regression
        #print "Total usable datapoints of donor %s: %d" % (donor_id, len(list(data[labels[0]][data[group] == donor_id]))) #shows usable datapoints per donor
        slope, _, rval, p_val, stderr = linregress(
            list(data[labels[0]][data[group] == donor_id]),
            list(data[labels[1]][data[group] == donor_id]))
        slope_per_donor = np.append(slope_per_donor, slope)
        rval_per_donor = np.append(rval_per_donor, rval)

    #average_slope = round(slope_per_donor.mean(),6) #get mean r-value across donors
    #average_rval = round(rval_per_donor.mean(),6) #get mean r-value across donors
    average_slope = round(np.nanmean(slope_per_donor),
                          6)  #get mean r-value across donors
    average_rval = round(np.nanmean(rval_per_donor),
                         6)  #get mean r-value across donors
    t_value, p_value = ttest_1samp(
        slope_per_donor,
        0)  #t-test (redundant information for downstream analyses)
    with open(output_file, 'a') as f:  #saving full data to .csv
        w = csv.writer(f)
        #print "Saving the analysis results..."
        w.writerow([
            gene, average_rval, average_slope, rval_per_donor[0],
            rval_per_donor[1], rval_per_donor[2], rval_per_donor[3],
            rval_per_donor[4], rval_per_donor[5], t_value, p_value
        ])

    with open(output_file_GSEA, 'a') as f:  #saving GSEA input data to .csv
        w = csv.writer(f, delimiter='\t')
        #print "Saving to GSEA input file..."
        w.writerow([gene, average_rval])

    #Scatterplot of gene expression against reverse inference fMRI map z-score
    print "Plotting the correlation graph..."
    ax = sns.lmplot(labels[0],
                    labels[1],
                    data,
                    hue=group,
                    legend=True,
                    fit_reg=True)  #comment-out for no plotting
    ax.set(xlabel="%s map z-score value" % (cog_function.capitalize()))
    ax = plot.title(gene)
    print "Saving the correlation graph..."
    plot.savefig(plot_pdf, format='pdf')
    plot.close()
    return
Example #19
File: post.py Project: ins-amu/taa-pattern
def get_taa_group_features(t, seeg, coords, tfr, tto):
    mask = (t > np.min(tfr)) * (t < max(np.max(tto), np.min(tfr) + 5.0))
    seeg -= np.mean(seeg, axis=1)[:, None]

    pca = PCA(n_components=PCA_NCOMP)
    comps = pca.fit_transform(seeg.T)
    var_explained = pca.explained_variance_ratio_
    duration = np.mean(tto - tfr)

    line_coords = np.linalg.norm(coords - coords[0], axis=1)
    slope, _, rval, _, _ = stats.linregress(line_coords, tfr)

    return duration, abs(slope), rval**2, var_explained[0], sum(
        var_explained[0:2])
Example #20
def calibrate_data(in_data):
    """
    Takes an input time series containing ccd (the independent variable, i.e. predictor) and gauge measurement
    (the dependent variable, i.e. predictand), and uses a linear model to derive the calibration parameters.
    :param in_data: input array with four columns of data:  year,  month,  ccd,  rain gauge measurement
    :return: calibration parameters
    """
    #  slice the array to select the gauge and ccd data
    gauge = in_data[:, 3]
    ccd = in_data[:, 2]
    #  derive a linear model using the intrinsic function linregress,  imported from the scipy package
    linear_model = linregress(ccd, gauge)
    a1 = linear_model[0]
    a0 = linear_model[1]
    #  return a tuple containing the calibration parameters
    return a0, a1
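A hypothetical call, assuming `calibrate_data` is in scope and the columns are year, month, ccd, gauge as the docstring describes (the numbers are invented):

import numpy as np

in_data = np.array([
    [2020, 1, 10.0, 25.0],
    [2020, 2, 14.0, 33.0],
    [2020, 3, 20.0, 47.0],
    [2020, 4, 25.0, 58.0],
])
a0, a1 = calibrate_data(in_data)
# Calibrated rainfall estimate for a new ccd value: a0 + a1 * ccd.
print(a0 + a1 * 18.0)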
Example #21
def approximate_random_effects(data, labels, group):
    correlation_per_donor = {}
    for donor_id in set(data[group]):
        correlation_per_donor[donor_id], _, _, _, _ = linregress(list(data[labels[0]][data[group] == donor_id]),
                                                                 list(data[labels[1]][data[group] == donor_id]))
    average_slope = np.array(correlation_per_donor.values()).mean()
    t, p_val = ttest_1samp(correlation_per_donor.values(), 0)
    print "Averaged slope across donors = %g (t=%g, p=%g)" % (average_slope, t, p_val)
    sns.violinplot([correlation_per_donor.values()], inner="points", names=["donors"])
    plt.ylabel("Linear regression slopes between %s and %s" % (labels[0], labels[1]))
    plt.axhline(0, color="red")

    sns.lmplot(labels[0], labels[1], data, hue=group, col=group, col_wrap=3)
    plt.show()

    return average_slope, t, p_val
Example #22
def capm(investment, market, risk_free_return=0):
    """Computes historical CAPM paramaters, using log returns, of the investment over the market.
  
  investment -- The daily prices of the investment under analysis.
  market -- The daily prices of the market investment.
  risk_free_return -- The risk-free return over the period of consideration, given as a fraction.
  
  Returns (alpha, beta, r), where r is the r-value."""
    alr = log(1.0 + risk_free_return)
    investment_returns = [log(1.0 * b / a) - alr for (a, b) in zip(investment[0:-1], investment[1:])]
    market_returns = [log(1.0 * b / a) - alr for (a, b) in zip(market[0:-1], market[1:])]
    x = linregress(market_returns, investment_returns)
    beta = x[0]
    alpha = x[1]
    r = x[2]
    return (alpha, beta, r)
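A small usage sketch with made-up daily prices (assumes `capm` and its `log`/`linregress` imports are in scope):

market = [100.0, 101.0, 102.5, 101.8, 103.2, 104.0]
investment = [50.0, 50.8, 51.9, 51.2, 52.4, 53.1]

alpha, beta, r = capm(investment, market, risk_free_return=0.0)
print('alpha=%.5f beta=%.3f r=%.3f' % (alpha, beta, r))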
Example #23
def comp_Z(se_data):
    ulist = np.unique(se_data[:, 1])
    max_points = 3
    Z = []
    for u in ulist:
        ui = np.where(se_data[:, 1] == u)
        # find lowest available temperatures
        ii = np.argsort(se_data[ui][:, 0])
        d = se_data[ui][ii][-max_points:]  # list of lowest temperatures for given U
        w0l = np.pi / d[:, 0]              # zero Matsubara frequency
        dRSigma = d[:, 2] / w0l            # approximation for the derivative of SE at w=0
        res = stats.linregress(1. / np.array(d[:, 0]), dRSigma)
        rr = unc.ufloat(res.intercept, res.stderr)
        rr = (1. / (1. - rr))
        Z.append([u, rr.n, rr.std_dev])
    return np.array(Z)
Example #24
def _test1():
    np.random.seed(0)
    x = np.linspace(0., 10., 41)
    y1 = 2. - 1.5 * x  # (2,-1)
    y2 = 2. * x - 5.  # (3, 1)
    y3 = -x + 4.  # (5, -1)
    y4 = 2. * x - 11.
    y = np.array(x)
    y[np.where(x < 2)] = y1[np.where(x < 2)]
    y[np.where((x >= 2) & (x < 3))] = y2[np.where((x >= 2) & (x < 3))]
    y[np.where((x >= 3) & (x < 5))] = y3[np.where((x >= 3) & (x < 5))]
    y[np.where(5 <= x)] = y4[np.where(5 <= x)]
    # plot(x, y, 'o')
    # show()
    n = len(x)
    var_x0 = np.var(x[:-1]) * (n - 1.)
    var_y0 = np.var(y[:-1]) * (n - 1.)
    mean_x = np.mean(x[:-1]) + (x[-1] - np.mean(x[:-1])) / n
    mean_y = np.mean(y[:-1]) + (y[-1] - np.mean(y[:-1])) / n
    dx = x[-1] - mean_x
    dy = y[-1] - mean_y
    _assert_eq(np.var(x) * n, _update_var(var_x0, n, dx))
    _assert_eq(np.var(y) * n, _update_var(var_y0, n, dy))
    beta0 = np.cov(x[:-1], y[:-1], bias=True)[0][1] / np.var(x[:-1])
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    # print(slope)
    # print(np.cov(x, y, bias=True)  [0][1] / np.var(x))
    print('slope exact = {}, computed = {}'.format(
        slope, _update_beta(beta0, n, dx, dy, var_x0,
                            np.var(x) * n)))
    print('intercept exact = {}, computed = {}'.format(intercept, mean_y -
                                                       slope * mean_x))
    segs = seg_lin_reg(x, y, 0.0001)
    assert len(segs) == 4
    _assert_eq(segs[0][1], 2.), _assert_eq(segs[0][2], -1.5)
    _assert_eq(segs[1][1], -5.), _assert_eq(segs[1][2], 2.)
    _assert_eq(segs[2][1], 4.), _assert_eq(segs[2][2], -1.)
    _assert_eq(segs[3][1], -11.), _assert_eq(segs[3][2], 2.)

    plot_segments(x, y, 0.0001)

    # test spikes
    y[17] = 2
    y[7] = 0
    y[-1] = 7
    y[-2] = 6
    plot_segments(x, y, 0.0001)
Example #25
    def analyze(self, gaps: Sequence, mlc: MLC, y_field_size: float = 100, profile_width=10):
        """Analyze an EPID image with varying MLC overlaps to determine the DLG.

        Parameters
        ----------
        gaps
            The gaps (i.e. overlap) of the leaves in mm.
            These should typically be in descending order and also be negative. E.g. (-1, ..., -2.2).

        mlc
            The MLC type/arrangement. This lets us know where the leaf centers are to take a profile along.

        y_field_size
            The field size along the y-dimension (perpendicular to the leaf travel). This will determine which leaves
            are associated with which gap.

        profile_width
            The width of the profile to take along the axes parallel to leaf motion. This should be a good bit wider
            than the gap values. The default is reasonable and it is unlikely it needs tweaking.
        """
        measured_dlg_per_leaf = []
        planned_dlg_per_leaf = []
        mlc = mlc.value['arrangement']
        g = list(gaps)
        g.sort()
        profile_width_px = round(self.image.dpmm * profile_width)
        mid_width = self.image.shape[1] / 2
        mid_height = self.image.shape[0] / 2
        for idx, center in enumerate(mlc.centers):
            if -y_field_size / 2 < center < y_field_size / 2:
                # get the pixel window area
                center_px = center * self.image.dpmm
                width_px = mlc.widths[idx] / 4 * self.image.dpmm
                top = ceil(mid_height + center_px + width_px)
                bottom = floor(mid_height + center_px - width_px)
                # sample the window and take the average perpendicular to MLC motion
                window = self.image[bottom:top, int(mid_width - profile_width_px):int(mid_width + profile_width_px)]
                width = self._determine_measured_gap(window.mean(axis=0))
                planned_dlg_per_leaf.append(self._get_dlg_offset(y_field_size, center, g))
                measured_dlg_per_leaf.append(width)
        # fit the data to a line and determine the DLG from the 0 intercept
        lin_fit = stats.linregress(planned_dlg_per_leaf, measured_dlg_per_leaf)
        dlg = lin_fit.intercept / lin_fit.slope
        self._lin_fit = lin_fit
        self.measured_dlg = dlg
        self.planned_dlg_per_leaf = planned_dlg_per_leaf
        self.measured_dlg_per_leaf = measured_dlg_per_leaf
Example #26
def approximate_random_effects(data, labels, group):

    correlation_per_donor = {}
    for donor_id in set(data[group]):
        correlation_per_donor[donor_id], _, _, _, _ = linregress(list(data[labels[0]][data[group] == donor_id]),
                                                       list(data[labels[1]][data[group] == donor_id]))
    average_slope = np.array(correlation_per_donor.values()).mean()
    t, p_val = ttest_1samp(correlation_per_donor.values(), 0)
    print "Averaged slope across donors = %g (t=%g, p=%g)"%(average_slope, t, p_val)
    sns.violinplot([correlation_per_donor.values()], inner="points", names=["donors"])
    plt.ylabel("Linear regression slopes between %s and %s"%(labels[0],labels[1]))
    plt.axhline(0, color="red")
    
    sns.lmplot(labels[0], labels[1], data, hue=group, col=group, col_wrap=3)
    plt.show()
    
    return average_slope, t, p_val
Example #27
def fit_exp_f(y, x):
    """
    Returns parameters A and B that would fit an exponential
    function of y = A*e^(Bx)
    Parameters
    ----------
        y: pd.Series
            Variable y in the formula    
        x: pd.Series
            Variable x in the formula
    Returns
    -------
        Parameters A and B
    """    

    ## Fit with y = Ae^(Bx) -> logy = logA + Bx
    # Returns A and B of a function as: y = A*e^(Bx)
    B, logA, r_value, p_value, std_err = linregress(transpose(x.values), log(y))
    
    return exp(logA), B
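A quick check of the log-transform trick described in the docstring, with synthetic data (assumes `fit_exp_f` and its bare numpy/scipy imports are in scope):

import numpy as np
import pandas as pd

x = pd.Series(np.linspace(0.0, 5.0, 50))
y = pd.Series(3.0 * np.exp(0.7 * x.values))  # y = A*e^(Bx) with A=3, B=0.7

A, B = fit_exp_f(y, x)
print(A, B)  # expected to recover roughly 3.0 and 0.7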
Example #28
 def test_multinomial_elementwise_distribution(self):
     '''Verify that the created variables approach a multinomial distribution for large numbers
     of samples.'''
     (m, n, k) = (6, 5, 1)
     r = 2**np.arange(4, 17)
     p = statutil.random_row_stochastic((m, n))
     #p = statutil.scale_row_sums(np.ones((m, n)))
     error = np.zeros((len(r), ))
     for (i, r_val) in enumerate(r):
         for _ in xrange(k):
             x = statutil.multinomial_elementwise(p, r_val)
             # Root-mean-square-error of observed frequencies w.r.t. desired frequencies
             error[i] += statutil.norm_frobenius_scaled(
                 statutil.hist(x, n) / (1.0 * r_val) - p)
         error[i] /= (1.0 * k)
     # Validate the model error of the central limit theorem: C*r^(-0.5).
     # This is a consequence of the Central Limit Theorem. We are making k experiments for
     # each value of n. Even if k=1, there's a 95% chance that we are within ~1.96 standard deviations
     # from the mean of the normal distribution sqrt(n)*[observed freq variable - p[i,j]] for each
     # entry j of a row i of the matrix p. So if row i's stddev is s[i], the sum of squared errors
     # should be (with 95% confidence) <= n * (1.96*s[i])^2. So
     # C <= sqrt(sum(n * (1.96*s[i])^2)_i / (m*n)) = 1.96 * sqrt(sum(s[i]^2)_i / m).
     # See http://en.wikipedia.org/wiki/Central_limit_theorem
     alpha, c, r_value, _, _ = linregress(np.log(r), np.log(error))
     c = np.exp(c)
     #        print c , 1.96 * np.linalg.linalg.norm(np.sum(p * np.arange(p.shape[1]) ** 2, axis=1) -
     #                                                          np.sum(p * np.arange(p.shape[1]), axis=1) ** 2,
     #                                                          2) / np.sqrt(p.shape[0]),
     assert_almost_equal(alpha,
                         -0.5,
                         decimal=1,
                         err_msg='Unexpected error term growth power')
     self.assertTrue(
         c <= 1.96 * np.linalg.linalg.norm(
             np.sum(p * np.arange(p.shape[1])**2, axis=1) -
             np.sum(p * np.arange(p.shape[1]), axis=1)**2, 2) /
         np.sqrt(p.shape[0]),
         'Error term coefficient outside 95% confidence interval')
     self.assertTrue(
         abs(r_value) > 0.99,
         'Error does not fit a power law in sample size')
Example #29
    def calculate_histogram(self):
        self.calculating_histograms = True
        locations = self.locations.copy()
        t1 = tp.filter_stubs(locations, self.config['process']['min_traj_length'])
        # t2 = t1[((t1['mass'] > self.config['process']['min_mass']) & (t1['size'] < self.config['process']['max_size']) &
        #          (t1['ecc'] < self.config['process']['max_ecc']))]
        im = tp.imsd(t1, self.config['process']['um_pixel'], self.config['process']['fps'])
        self.histogram_values = []
        for pcle in im:
            if general_stop_event.is_set():
                break

            data = im[pcle]
            t = data.index[~np.isnan(data.values)]
            val = data.values[~np.isnan(data.values)]
            try:
                slope, intercept, r, p, stderr = stats.linregress(np.log(t), np.log(val))
                self.histogram_values.append([slope, intercept])
            except:
                pass
        self.calculating_histograms = False
        self.publisher.publish('histogram', self.histogram_values)
Example #30
def recalculate_line(consensus, is_vertical):
    """Given a discovered consensus, recalculate the line with other points that are close enough.
    
    Args:
        consensus (list): List of consensus measurements.
        is_vertical (bool): Whether the line is almost vertical.
    
    Returns:
        tuple: Start and end points of line segment.
    """
    cartesian_consensus = numpy.array([point.location for point in consensus])
    # If almost vertical, calculate line in terms of y.
    if is_vertical:
        cartesian_consensus = numpy.fliplr(cartesian_consensus)

    # Calculate regression line.
    slope, intercept, r_, p_, e_ = stats.linregress(cartesian_consensus[:, 0],
                                                    cartesian_consensus[:, 1])

    start = util.nearest(cartesian_consensus[0], slope, intercept)
    end = util.nearest(cartesian_consensus[0], slope, intercept)
    distance = 0

    for i in range(len(consensus)):
        for j in range(i + 1, len(consensus)):
            point_a = util.nearest(cartesian_consensus[i], slope, intercept)
            point_b = util.nearest(cartesian_consensus[j], slope, intercept)
            new_dist = util.dist(point_a, point_b)
            if new_dist > distance:
                distance = new_dist
                start = point_a
                end = point_b

    # If line is vertical, flip coordinates back.
    if is_vertical:
        start = numpy.flipud(start)
        end = numpy.flipud(end)

    return start, end
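`util.nearest` (orthogonal projection onto the fitted line) and `util.dist` are not shown; plausible sketches of both, under the same y = slope*x + intercept convention (assumptions, not the project's actual helpers):

import numpy as np

def nearest(point, slope, intercept):
    # Hypothetical helper: foot of the perpendicular from (x, y) onto y = slope*x + intercept.
    x, y = point
    px = (x + slope * (y - intercept)) / (slope ** 2 + 1.0)
    return np.array([px, slope * px + intercept])

def dist(a, b):
    # Hypothetical helper: Euclidean distance between two points.
    return float(np.linalg.norm(np.asarray(a) - np.asarray(b)))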
Example #31
def get_slopes(symbol_list, fund_type):

    # For each fund, I perform a simple least-squares linear regression
    # to get the value as a function of time.

    # Here, I also restrict the analysis to only those funds which have
    # gained value over the past five years (i.e. have a positive slope).
    # The logic behind this is that, if we're only adding one fund to the
    # portfolio, we can limit ourselves to choosing one that has
    # historically done well. The question is then whether the US bonds
    # that have done well have done better than the emerging market funds
    # that have done well.

    slopes = []
    for symbol in symbol_list:
        slope = stats.linregress(parse_csv(symbol, fund_type))[0]
        if slope > 0.0:
            slopes.append(slope)


#    print(len(slopes))
    return slopes
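`parse_csv` is not shown; the call works because `stats.linregress` also accepts a single two-dimensional array in which one dimension has length 2. A toy illustration (the history array is invented):

import numpy as np
from scipy import stats

history = np.array([
    np.arange(5, dtype=float),          # x: time index
    [10.0, 10.5, 11.2, 11.8, 12.5],     # y: fund value
])
slope = stats.linregress(history)[0]
print(slope > 0.0)  # True for this rising toy series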
Example #32
def decompose(_data, _plots = False):
	'''
			Function to decompose a signal into its trend and normal variation
			Input:
				_data: signal to decompose
				_plots: print plots or not (default False)
			Output:
				DataDecomp = _data - slope*_data.index
				slope, intercept = linear regression coefficients
	'''
	indexDecomp = np.arange(len(_data))

	slope, intercept, r_value, p_value, std_err = linregress(indexDecomp, np.transpose(_data.values))
	dataDecomp=pd.DataFrame(index = _data.index)
	name = _data.name
	result = []
	
	for n in range(len(_data)):
		result.append(float(_data.values[n]-slope*n))
	dataDecomp[(name + '_' + '_flat')] = result
	
	trend = slope*indexDecomp + intercept
	if _plots == True:
		
		with plt.style.context('seaborn-white'):
			fig, ax = plt.subplots(figsize=(20,10))
			ax.plot(_data.index, _data.values, label = "Actual", marker = None)
			ax.plot(_data.index, dataDecomp[(name + '_' +'_flat')], marker = None, label = 'Flattened')
			ax.plot(_data.index, trend, label = 'Trend')
			ax.legend(loc="best")
			ax.axis('tight')
			ax.set_title("Signal Decomposition - "+ name)
			ax.set_xlabel('Index')
			ax.set_ylabel('Signal')
			ax.grid(True)

			plt.show()
			
	return dataDecomp, slope, intercept
Example #33
    def calculate_histogram(self):
        """ Starts a new thread to calculate the histogram of fit-parameters based on the mean-squared displacement of
        individual particles. It publishes the data on topic `histogram`.

        .. warning:: This method is incredibly expensive. Since it runs on a thread it can block other pieces of code,
        especially the GUI, which runs on the same process.

        .. TODO:: The histogram loops over all the particles. It would be better to skip particles for which there is
            no new data

        .. TODO:: Make this method able to run on a separate process. So far it is not possible because it relies on data
            stored on the class itself (`self.locations`).
        """
        self.calculating_histograms = True
        locations = self.locations.copy()
        t1 = tp.filter_stubs(locations,
                             self.config['process']['min_traj_length'])
        t2 = t1[((t1['mass'] > self.config['process']['min_mass']) &
                 (t1['size'] < self.config['process']['max_size']) &
                 (t1['ecc'] < self.config['process']['max_ecc']))]
        im = tp.imsd(t2, self.config['process']['um_pixel'],
                     self.config['process']['fps'])
        self.histogram_values = []
        for pcle in im:
            if general_stop_event.is_set():
                break

            data = im[pcle]
            t = data.index[~np.isnan(data.values)]
            val = data.values[~np.isnan(data.values)]
            try:
                slope, intercept, r, p, stderr = stats.linregress(
                    np.log(t), np.log(val))
                self.histogram_values.append([slope, intercept])
            except:
                pass
        self.calculating_histograms = False
        self.publisher.publish('histogram', self.histogram_values)
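Each per-particle fit is an ordinary log-log regression of the mean-squared displacement against lag time. A stand-alone sketch with a synthetic MSD curve (no trackpy required; the diffusion values are arbitrary):

import numpy as np
from scipy import stats

t = np.linspace(0.1, 10.0, 100)   # lag time
msd = 4 * 0.5 * t ** 1.0          # MSD = 4*D*t**alpha with D=0.5, alpha=1

slope, intercept, r, p, stderr = stats.linregress(np.log(t), np.log(msd))
print(slope, np.exp(intercept))  # slope ~ alpha (1.0), exp(intercept) ~ 4*D (2.0)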
Example #34
    def linregress_hb_drop_with_time_to_previous_rbc(self,
                                                     si,
                                                     threshold=np.inf):
        """
        Compute a linear regression of each RBCs hemoglobin saturation drop with the time difference
        to the previous RBC.

        Args:
            si (int): segment index
            threshold (float): maximum value of time difference used for the linear regression

        Returns:
            float tuple, return value of scipy.stats.linregress

        """
        time_difference = self.rbcDataPostProcessor.timeToPreviousRBC(
            si, self.n_rbc_average(si))
        hb_drop = self.hb_difference(si)[1:]
        filtered_times = time_difference[time_difference < threshold]
        filtered_drops = hb_drop[time_difference < threshold]
        if filtered_times.size:
            return linregress(filtered_times, filtered_drops)
        else:
            return np.nan, np.nan, np.nan, np.nan, np.nan
Example #35
File: Analyzer.py Project: pleger/CityCA
    def getLinearRegressionData(self, log):
        values = self.createValuesForRegression(log)

        slope, intercept, r_value, p_value, std_err = linregress(values[0], values[1])
        return [slope,intercept,r_value*r_value]
Example #36
File: zorro_util.py Project: C-CINA/zorro
def fit( x, y, funchandle='gauss1', estimates=None ):
    """ Returns: fitstruct,  fitY, Rbest """
    from scipy.optimize import curve_fit 
    from scipy.stats.stats import linregress

    if funchandle == 'gauss1':
        def fitfunc( x, a1, b1, c1 ):
            return a1 * np.exp( -( (x-b1)/ c1)**2 )
        # Really arbitrary c1 estimate at basically 25 pixels..
        if estimates is None:
            estimates = np.array( [np.max(y), x[np.argmax(y)], 25.0*(x[1]-x[0]) ] )
        
    elif funchandle == 'poly1':
        def fitfunc( x, a1, b1 ):
            return a1 * x + b1
        if estimates is None:
            slope = (np.max(y)-np.min(y))/(np.max(x)-np.min(x))
            intercept = np.min(y) - slope*x[np.argmin(y)]
            estimates = [slope, intercept]
    elif funchandle == 'poly2':
        def fitfunc( x, a1, b1, c1 ):
            return a1 * x **2.0 + b1 *x + c1
        if estimates is None:
            slope = (np.max(y)-np.min(y))/(np.max(x)-np.min(x))
            intercept = np.min(y) - slope*x[np.argmin(y)]
            estimates = [0.0, slope, intercept]
    elif funchandle == 'poly3':
        def fitfunc( x, a1, b1, c1, d1 ):
            return a1 * x **3.0 + b1 *x**2.0 + c1*x + d1
        if estimates is None:
            slope = (np.max(y)-np.min(y))/(np.max(x)-np.min(x))
            intercept = np.min(y) - slope*x[np.argmin(y)]
            estimates = [0.0, 0.0, slope, intercept]
    elif funchandle == 'poly5':
        def fitfunc( x, a1, b1, c1, d1, e1, f1 ):
            return a1 * x **5.0 + b1 *x**4.0 + c1*x**3.0 + d1*x**2.0 + e1*x + f1
        if estimates is None:
            slope = (np.max(y)-np.min(y))/(np.max(x)-np.min(x))
            intercept = np.min(y) - slope*x[np.argmin(y)]
            estimates = [0.0, 0.0, 0.0, 0.0, slope, intercept]
    elif funchandle == 'abs1':
        def fitfunc( x, a1 ):
            return a1 * np.abs( x )
        if estimates is None:
            estimates = np.array( [ (np.max(y)-np.min(y))/(np.max(x)-np.min(x))])
    elif funchandle == 'exp':
        def fitfunc( x, a1, c1 ):
            return a1 * np.exp( c1*x )
        if estimates is None:
            estimates = np.array( [1.0, -1.0] )
    elif funchandle == 'expc':
        def fitfunc( x, a1, c1, d1 ):
            return a1 * np.exp( c1*x ) + d1
        if estimates is None:
            estimates = np.array( [1.0, -1.0, 1.0] )
    elif funchandle == 'power1':
        def fitfunc( x, a1, b1 ):
            return a1*(x**b1)
        if estimates is None:
            estimates = np.array( [1.0, -2.0] )   
    elif funchandle == 'power2':
        def fitfunc( x, a1, b1, c1 ):
            return a1*(x**b1) + c1
        if estimates is None:
            estimates = np.array( [1.0, -2.0, 1.0] )    
    elif funchandle == 'powerpoly1':
        def fitfunc( x, a1, b1, a2, c1 ):
            return a1*(x**b1) + a2*x + c1
        if estimates is None:
            estimates = np.array( [1.0, -2.0, 0.0,  1.0] )
    else:
        fitfunc = funchandle
        
    try:
        fitstruct, pcov = curve_fit( fitfunc, x, y, p0=estimates )
        perr = np.sqrt(np.diag(pcov))
        print( "Fitting completed with perr = " + str(perr) )
        fitY = fitfunc( x, *fitstruct )
        goodstruct = linregress( x, fitfunc( x, *fitstruct ) )
        Rbest = goodstruct[2]
    except RuntimeError:
        print( "RAM: Curve fitting failed")
        return
    return fitstruct,  fitY, Rbest
Example #37
# Remove Visual count 0 class
#dfCS2 = dfCS[dfCS['JNM']!=0]
#dfCS2 = dfCS1

# -----------------------------------------------
#### see zero counts Camille with large count software
dfCS0 = dfCS[dfCS['JNM']==0]
#### Query 0 count for me and count for C
dfCSM0 = dfCS2[dfCS2['SoftC']==0]
# --------------------------------------------------

# Create lists for correlation
camC1 = list(dfCS3['J'])
objc = list(dfCS3['SoftC'])
               
slopeO, interceptO, r_valueO, p_valueO, std_errO = linregress(camC1,objc)
print "r squared count = ",r_valueO**2
r_valueO = r_valueO**2
print "slope",slopeO
print "p-value",p_valueO


# plot raw correlation
plt.scatter(camC1,objc)
#plt.title('Obj count: erode = %s, dilate = %s, thres = %s'%(er,dil,thres))
plt.xlabel('Visual count')
plt.ylabel('Software count')
#pylab.savefig(resultsdir + 'ObjCount-' + str(count) + '.pdf',bbox_inches='tight')
plt.show()

# plot mean with sd
Example #38
    def test_regression_of_returns_factor(self, returns_length, regression_length):
        """
        Tests for the built-in factor `RollingLinearRegressionOfReturns`.
        """
        my_asset_column = 0
        start_date_index = 6
        end_date_index = 10

        assets = self.asset_finder.retrieve_all(self.sids)
        my_asset = assets[my_asset_column]
        my_asset_filter = AssetID() != (my_asset_column + 1)
        num_days = end_date_index - start_date_index + 1

        # The order of these is meant to align with the output of `linregress`.
        outputs = ["beta", "alpha", "r_value", "p_value", "stderr"]

        # Our regression factor requires that its target asset is not filtered
        # out, so make sure that masking out our target asset does not take
        # effect. That is, a filter which filters out only our target asset
        # should produce the same result as if no mask was passed at all.
        for mask in (NotSpecified, my_asset_filter):
            regression_factor = RollingLinearRegressionOfReturns(
                target=my_asset, returns_length=returns_length, regression_length=regression_length, mask=mask
            )
            results = self.engine.run_pipeline(
                Pipeline(columns={output: getattr(regression_factor, output) for output in outputs}),
                self.dates[start_date_index],
                self.dates[end_date_index],
            )
            output_results = {}
            expected_output_results = {}
            for output in outputs:
                output_results[output] = results[output].unstack()
                expected_output_results[output] = full_like(output_results[output], nan)

            # Run a separate pipeline that calculates returns starting 2 days
            # prior to our start date. This is because we need
            # (regression_length - 1) extra days of returns to compute our
            # expected regressions.
            returns = Returns(window_length=returns_length)
            results = self.engine.run_pipeline(
                Pipeline(columns={"returns": returns}),
                self.dates[start_date_index - (regression_length - 1)],
                self.dates[end_date_index],
            )
            returns_results = results["returns"].unstack()

            # On each day, calculate the expected regression results for Y ~ X
            # where Y is the asset we are interested in and X is each other
            # asset. Each regression is calculated over `regression_length`
            # days of data.
            for day in range(num_days):
                todays_returns = returns_results.iloc[day : day + regression_length]
                my_asset_returns = todays_returns.iloc[:, my_asset_column]
                for asset, other_asset_returns in todays_returns.iteritems():
                    asset_column = int(asset) - 1
                    expected_regression_results = linregress(y=other_asset_returns, x=my_asset_returns)
                    for i, output in enumerate(outputs):
                        expected_output_results[output][day, asset_column] = expected_regression_results[i]

            for output in outputs:
                assert_frame_equal(
                    output_results[output],
                    DataFrame(
                        expected_output_results[output],
                        index=self.dates[start_date_index : end_date_index + 1],
                        columns=assets,
                    ),
                )
Example #39
meanGroundedTime = []
for robot in dataset[0]:
	robotGroundedTimesteps = []
	for row in robot:
		rowGrounded = 1 if 1 in row else 0
		robotGroundedTimesteps.append(rowGrounded)
	meanGroundedTime.append(np.mean(robotGroundedTimesteps))

print(np.mean(meanGroundedTime), np.std(meanGroundedTime), min(meanGroundedTime), max(meanGroundedTime))

rp = pearsonr(meanGroundedTime, dataset[REWARD_SIGNALS])
print(rp)
rs = spearmanr(meanGroundedTime, dataset[REWARD_SIGNALS])
print(rs)
lr = linregress(meanGroundedTime, dataset[REWARD_SIGNALS])
print(lr)

fit = np.polyfit(meanGroundedTime, dataset[REWARD_SIGNALS], 1)

print(fit)
fitfn = np.poly1d(lr[0:2])

plt.plot(meanGroundedTime, dataset[REWARD_SIGNALS], 'go', np.arange(.4, 1.1, .01), fitfn(np.arange(.4, 1.1, .01)), '--k')

plt.title("Proportion Time Grounded and Normalized Reinforcement Signal"
		+ "\n For " + robotType[0].upper() + robotType[1:] + " Robot with \"jump\" Command")
plt.ylabel("Normalized Reinforcement Signal")
plt.xlabel("Proportion of Time Grounded")
plt.axis([.45, 1.05, -1.1, 1.1])
plt.show()
Example #40
        model_sum.append(model_hour_sum)
        validation_sum.append(validation_hour_sum)

    model_dump.append(np.nansum(model_sum))
    validation_dump.append(np.nansum(validation_sum))

err = np.abs(np.array([validation_dump]) - np.array([model_dump]))
err_av = np.mean(err)
sum1 = np.sum(model_dump)
print('Difference between valid and model =', err)
print('Average daily error =', err_av)
print('Sum of model LWD =', sum1)
print('Sum of valid LWD =', np.sum(validation_dump))

slope, intercept, r_value, p_value, std_err = \
linregress(model_dump,validation_dump)
print('slope =', slope)
print('intercept =', intercept)

counter1 = np.zeros_like(model_dump)
counter2 = np.zeros_like(model_dump)
counter5 = np.zeros_like(model_dump)

# Counters for number of model LWD that are between 1, 2 and 5 hours of
# validation LWD
for i in range(len(model_dump)):
    if (model_dump[i]<=validation_dump[i]+1 and \
        model_dump[i]>=validation_dump[i]-1):
        counter1[i] = 1

    if (model_dump[i]<=validation_dump[i]+2 and \
Example #41
len(histDic1)
plt.scatter(histDic1.keys(),histDic1.values())
plt.hist(histDic1.values())

histDic2=[np.count_nonzero((binwidth-tol < testBin) & (testBin < binwidth+tol)) for binwidth in range(100)]
len(histDic2)

histDicT=bInt(binSs,0.02)
len(histDicT)
plt.hist(histDicT.values(),bins=sample)#np.arange(min(dataD), max(dataD) + binwidth, binwidth))

from scipy.stats.stats import linregress
for value in binSs[:6]:
    print(value)
for index in range(0, len(binSs[:sample]) - 6):
    s, intercept, r, p, std_error = linregress(binSs[index:index + 7], binSs[index:index + 7])
    print(s, intercept, r, p, std_error,"\n")
for value in histDicT:
    temp = histDicT[value]
    print(value,temp)#,histDicT[0],histDicT[1],histDicT[2])
histDicT=histDic1
histDicT=bInt(binS,0.02)
import itertools
slope1=[]
slope1pos=[]
for index in range(0, len(histDicT) - 6):
        tempD=dict(itertools.islice(histDicT.items(), index,index + 7))
        #print(list(tempD.values()))
        s, intercept, r, p, std_error = linregress(list(tempD.keys()), list(tempD.values()))
        #print(index,tempD,s, intercept, r, p, std_error)
        slope1.append(s)
        """
        roi_data_mean = np.ones(len(names))*-99
        roi_data_std = np.ones(len(names))*-99
        roi_data_r = np.ones(len(names))*-99
        roi_data_p = np.ones(len(names))*-99
        roi_data_m = np.ones(len(names))*-99
            
        for i, name in enumerate(names):
            #wm_name = 'wm-' + hemi + '-' + name
            wm_name = '{}_{}'.format(hemi, name)
    
            if wm_name in df1.columns:
                df_merge = df1.merge(df2, on='nspn_id')
                roi_data_mean[i] = df1[wm_name].mean()
                roi_data_std[i] = df1[wm_name].std()
                m, c, r, p, sterr = linregress(df_merge[wm_name + '_x'], df_merge[wm_name + '_y'])
                roi_data_m[i] = m
                roi_data_r[i] = r
                roi_data_p[i] = 1 - p

        
        """
        Make a vector containing the data point at each vertex.
        """
        vtx_data_mean = roi_data_mean[labels]
        vtx_data_std = roi_data_std[labels]
        vtx_data_r = roi_data_r[labels]
        vtx_data_p = roi_data_p[labels]
        vtx_data_m = roi_data_m[labels]

        """
Example #43
  d2 = hpcs.double[t2,t1]

  naive   = s1['instructions'] + s2['instructions']
  actual  = d1['instructions'] + d2['instructions']
  degr = actual / naive
  degradations += [degr]

  for k, v1 in s1.items():
    v2 = s2[k]
    total = v1 + v2
    counters[k] += [total]


  # total = gettotal(shpc1, shpc2, ['LLC-stores', 'LLC-loads'])
  total = gettotal(s1, s2, ['instructions'])
  plotdata[total] = degr

for counter,v in counters.items():
  cor, pv = pearsonr(v, degradations)
  if pv < 0.1:
    print ("{:25} {: .3f} {:2.1%}".format(counter, cor, pv*100))


if plotdata:
  X = sorted(list(plotdata.keys()))
  Y = [plotdata[x] for x in X]
  print(linregress(X,Y))
  p.xlabel("counters")
  p.ylabel("degradatation")
  p.plot(X, Y, '-o')
  p.show()
Example #44
def main():
    usage = 'usage: %prog [opt] lfq_filename gene_exprs_filename miso_filename'\
        '\nThree arguments must be specified in command line:\n'\
        '1) LFQ filename, containing LFQ intensities and two replicates.\n'\
        '2) Gene exprs filename, read count data.\n'\
        '3) AS status of genes (miso output)\n'
    parser = OptionParser(usage=usage)
    
    # colnames for lfq data
    parser.add_option('--lfq_gene_colname', dest='lfq_gene_colname',
                      default='Gene names',
                      help='Column name of gene name')
    parser.add_option('--samp1_lfq_colname1', dest='samp1_lfq_colname1',
                      default='LFQ intensity T331_1',
                      help='Column name of LFQ intensity, sample 1 replicate 1.')
    parser.add_option('--samp1_lfq_colname2', dest='sampl1_lfq_colname2',
                      default='LFQ intensity T331_2',
                      help='Column name of LFQ intensity, sample 1 replicate 2.')
    parser.add_option('--samp2_lfq_colname1', dest='sampl2_lfq_colname1',
                      default='LFQ intensity R_1',
                      help='Column name of LFQ intensity, sample 2 replicate 1.')
    parser.add_option('--samp2_lfq_colname2', dest='sampl2_lfq_colname2',
                      default='LFQ intensity R_2',
                      help='Column name of LFQ intensity, sample 2 replicate 2.')  
    # colnames for gene exprs data
    parser.add_option('--mrna_gene_colname', dest='mrna_gene_colname',
                      default='gene_name',
                      help='Column name for mRNA exprs data.')
    parser.add_option('--samp1_exprs_colname', dest='samp1_exprs_colname',
                      default='LTL331',
                      help='Column name of gene exprs for sample 1')  
    parser.add_option('--samp2_exprs_colname', dest='samp2_exprs_colname',
                      default='LTL331_R',
                      help='Column name of gene exprs for sample 2')
    parser.add_option('--spliced_only', dest='spliced_only',
                      default='False',
                      help='True or False. True shows only spliced genes. '\
                        'False shows all. Default is False.')
    parser.add_option('--convert_to_log2', dest='convert_to_log2',
                      default='True',
                      help='True or False, converts mRNA exprs data to to log2'\
                        ' scale. Default True.')
    parser.add_option('--title', dest='title',
                      default='Plot title',
                      help='Title of plot.')
    parser.add_option('--xlabel', dest='xlabel',
                      default='x axis',
                      help='X axis label of plot')
    parser.add_option('--ylabel', dest='ylabel',
                      default='y axis',
                      help='Y axis label of plot')
    parser.add_option('--annotate_genes', dest='annotate_genes',
                      default=None,
                      help='CSV list of genes to be annotated.\n'\
                        'Default is None, allowing mouse click annotation.')
    (options, args) = parser.parse_args()
    
    if len(args) < 3:
        print 'Not enough args specified.\n%s' %usage
        sys.exit()
    
    lfq_filename = args[0]
    gene_exprs_filename = args[1]
    miso_filename = args[2]
    
    # parse options
    # splicing only option
    spliced_only = options.spliced_only
    if spliced_only in ['True', 'true', 'T', 'TRUE']:
        spliced_only = True
    elif spliced_only in ['False', 'false', 'F', 'FALSE']:
        spliced_only = False
    else:
        print 'Spliced only option must be True or False. %s found.' \
            %spliced_only
        sys.exit()
    print 'splicing_only: %s' %spliced_only
    # log2 conversion option
    convert_to_log2 = options.convert_to_log2
    if convert_to_log2 in ['True', 'T']:
        convert_to_log2 = True
    elif convert_to_log2 in ['False', 'F']:
        convert_to_log2 = False
    else:
        print '--convert_to_log2 must be True or False. %s found.'\
            %convert_to_log2
    print 'Convert to log2: %s' %convert_to_log2
    # xlabel, ylabel, title options
    xlabel = options.xlabel
    ylabel = options.ylabel
    title = options.title
    # annotate genes options
    if options.annotate_genes is not None:
        annotated_gene_list = options.annotate_genes.split(',')
    else:
        annotated_gene_list = options.annotate_genes
    
    
    lfq_mrna_dic = {}
    
    # Add LFQ information to dic
    lfq_mrna_dic = index_lfq_data(lfq_filename, lfq_mrna_dic, options, 
                                  filter_out_missing_data=True)
    print 'lfq data indexed from file: %s' %lfq_filename
    
    # Add gene exprs to dic
    lfq_mrna_dic = index_mrna_data(gene_exprs_filename, lfq_mrna_dic, options,
                                   filter_na=True, 
                                   convert_to_log2=convert_to_log2)
    print 'mrna data indexed from file: %s' %gene_exprs_filename
    
    # Write dic to file
    # write_lfq_mrna_data_to_file(lfq_mrna_dic, out_filename, options)
    
    # Get differentially spliced genes (non-redundant only)
    spliced_genes = list(set(get_spliced_genes(miso_filename)))
    print '%s spliced genes extracted from %s' %(len(spliced_genes), 
                                                 miso_filename)
    
    # Calculate Pearson and Spearman correlation for non-AS genes and AS genes
    
    # Create x and y vectors for spliced, nonspliced and both
    spliced_mrna_log2_fc, spliced_lfq_diff = \
        split_by_splice_status(lfq_mrna_dic, spliced_genes, spliced=True)
    non_spliced_mrna_log2_fc, non_spliced_lfq_diff = \
        split_by_splice_status(lfq_mrna_dic, spliced_genes, spliced=False)
    mrna_log2_fc, lfq_diff = \
        split_by_splice_status(lfq_mrna_dic, spliced_genes, spliced=None)
    # Calculate r and pvals for Pearson

    for mrna_diff_vector, \
        lfq_diff_vector, \
        splice_status in \
            zip([spliced_mrna_log2_fc, non_spliced_mrna_log2_fc, mrna_log2_fc], 
                [spliced_lfq_diff, non_spliced_lfq_diff, lfq_diff], 
                ['DS Genes', 'Non-DS Genes', 'All Genes']):
        pearsonr, pearsonpval = \
            stats.pearsonr(mrna_diff_vector, lfq_diff_vector)
        print 'Gene set:%s\nPearson coefficient: %s\nPval:%s' \
            %(splice_status, pearsonr, pearsonpval)
        spearmanr, spearmanpval = \
            stats.spearmanr(mrna_diff_vector, lfq_diff_vector)
        print 'Gene set:%s\nSpearman coefficient: %s\nPval:%s' \
            %(splice_status, spearmanr, spearmanpval)
        slope, intercept, r_value, p_value, std_err = stats.linregress(mrna_diff_vector,lfq_diff_vector)
        print 'slope: %s\nintercept: %s\nr_value: %s\nstd_error: %s' %(slope, intercept, r_value, std_err)
        
        # calculate concordants
        concord_count = 0
        all_count = 0
        for mrna, lfq in zip(mrna_diff_vector, lfq_diff_vector):
            if mrna * lfq >= 0:    # means concordant
                concord_count += 1
            all_count += 1
        frac_concord = float(concord_count) / all_count
        print 'Gene set:%s\nConcordance:%s/%s, %s' %(splice_status, concord_count, all_count, frac_concord)
                
        
    # Scatterplot data
    scatter_plot_lfq_mrna(lfq_mrna_dic, spliced_genes, spliced_only=spliced_only,
                          title=title, xlabel=xlabel, ylabel=ylabel,
                          annotated_gene_list=annotated_gene_list)