def test_where():
    """Check ``Series.where`` against boolean selection, with and without alignment."""
    ser = Series(np.random.randn(5))
    positive = ser > 0

    # where() followed by dropna() must match plain boolean indexing.
    kept = ser.where(positive).dropna()
    selected = ser[positive]
    assert_series_equal(kept, selected)

    # Filling the rejected entries with their negation gives the absolute value.
    flipped = ser.where(positive, -ser)
    assert_series_equal(flipped, ser.abs())

    # where() keeps the shape and returns a new object.
    masked = ser.where(positive)
    assert (ser.shape == masked.shape)
    assert (masked is not ser)

    # test alignment
    flags = Series([True, False, False, True, False], index=ser.index)
    negative = -(ser.abs())
    partial_flags = flags[:3]

    # A condition covering only the first three labels rejects the rest.
    wanted = negative[flags].reindex(negative.index[:3]).reindex(negative.index)
    got = negative.where(partial_flags)
    assert_series_equal(got, wanted)

    # With a replacement value, only the first entry keeps its original sign.
    wanted = negative.abs()
    wanted.iloc[0] = negative[0]
    got = negative.where(partial_flags, -negative)
    assert_series_equal(got, wanted)
def _safe_cast(name, series: pd.Series):
    """Downcast a float64/int64 column to a narrower dtype after a range check.

    float64 columns become float32.  int64 columns become int8 when ``name``
    is ``"detected"`` and int32 otherwise.  Any other dtype raises TypeError.
    The range checks are plain ``assert`` statements.
    """
    dtype = series.dtype

    if dtype == np.float64:
        assert series.abs().max() < 1e37, "Max too close to float32 max."
        return series.astype(np.float32)

    if dtype != np.int64:
        raise TypeError(f"Unexpected non-int/float column type {series.dtype}")

    # Integer column: the target width depends on the column name.
    if name == "detected":
        assert series.abs().max() < 128, "Max too close to int8 max."
        return series.astype(np.int8)

    assert series.abs().max() < 2e9, "Max too close to int32 max."
    return series.astype(np.int32)
def li_et_al_2019(kurtosis: pd.Series, normal_period: range = range(50, 150), sigma_interval: float = 2) -> int:
    """
    Li et al. in 2019 used the kurtosis as a classification indicator by
    computing its mean and standard deviation in the early period of bearing
    operation. The first prediction time (FPT) was then determined as the
    point in time where the kurtosis exceeds the 2*std_dev interval.

    The FPT is the first index ``i`` after the normal period for which both
    sample ``i - 1`` and sample ``i`` deviate from the normal-period mean by
    more than ``sigma_interval`` standard deviations.

    :param kurtosis: kurtosis that is used as the FPT indicator
    :param normal_period: Range of the period that is representative for
        normal bearing behaviour. The search starts right after its last
        element (150 for the default range).
    :param sigma_interval: range of deviation that is allowed for the kurtosis
    :return: index of FPT, or 0 if the threshold is never exceeded twice in
        a row
    """
    kurtosis_normal = kurtosis[normal_period]
    mean = kurtosis_normal.mean()
    std_dev = kurtosis_normal.std()
    threshold = sigma_interval * std_dev

    deviation = np.asarray((kurtosis - mean).abs())

    # Start searching right after the normal period instead of at a fixed
    # index, so a non-default ``normal_period`` is honoured.  The lower bound
    # of 1 keeps the "previous sample" lookup from wrapping around.
    search_start = max(normal_period) + 1 if len(normal_period) else 1
    search_start = max(search_start, 1)

    exceeds = deviation > threshold
    # consecutive[j] is True when samples j and j + 1 both exceed the threshold.
    consecutive = exceeds[1:] & exceeds[:-1]
    hits = np.flatnonzero(consecutive[search_start - 1:])
    if hits.size:
        return int(hits[0]) + search_start
    return 0
def make_equal(series: pd.Series, matched: float) -> pd.Series:
    """
    Equally distribute a matched value over a series.

    The values are sorted ascending (absolute values are used when the
    series sums to a negative number).  Walking through the sorted values,
    every value that is not greater than the current per-item share is kept
    as it is and the share is recomputed from what is left of ``matched``
    for the remaining items.  From the first value that exceeds the share
    onwards, every item is set to that share.

    :param series: A series which has one or more rows.
    :param matched: A positive float number.
    :return: A series with the original index, negated again when the input
        summed to a negative number.
    """
    # A negative total means the series is processed on absolute values and
    # the sign is restored on return.
    check_negative = series.sum() < 0
    if check_negative:
        sorted_ = series.abs().sort_values()
    else:
        sorted_ = series.sort_values()
    # Initial equal share of the matched value per item.
    per_ = matched / series.size
    for i, v in enumerate(sorted_):
        if not v > per_:
            # This item stays below the share: keep it and spread the rest of
            # ``matched`` over the items that are still to be visited.
            # NOTE(review): on the last item the divisor is 0 - confirm the
            # intended result when no value exceeds the share.
            per_ = (matched - sorted_.iloc[:i + 1].sum()) / (series.size - (i + 1))
        else:
            break
    # ``i`` is the loop index at exit; it is undefined for an empty series.
    sorted_.iloc[i:] = per_
    # ``mul(0).add(...)`` aligns the sorted values back onto the input index.
    if check_negative:
        return series.mul(0).add(sorted_).mul(-1)
    else:
        return series.mul(0).add(sorted_)
def check_sum(self):
    """
    Check that the probabilities add up to 1 across choices.

    Also records the table dimensions on the instance: ``num_agents`` is
    the first and ``num_choices`` the last entry of
    ``self.probabilities.shape``.

    Rows whose sum is NaN are left out of the check.  Every other row must
    sum to 1 within an absolute tolerance of 1e-6.

    Inputs:
    None

    Raises:
    ProbabilityError if any checked row does not sum to one.
    """
    self.num_agents = self.probabilities.shape[0]
    self.num_choices = self.probabilities.shape[-1]

    row_totals = self.probabilities.sum(1)
    has_total = ~row_totals.isnull()
    diff_from_unity = (row_totals - 1).abs()

    # Filtering is a no-op when no total is NaN, so a single check covers
    # both cases.
    if not (diff_from_unity[has_total] < 1e-6).all():
        # Call-style raise works on Python 2 and Python 3 alike.
        raise ProbabilityError("probability values do not add up "
                               "to one across rows")
def test_mask():
    """Check that ``Series.mask(cond)`` mirrors ``Series.where(~cond)``."""
    # compare with tested results in test_where
    ser = Series(np.random.randn(5))
    positive = ser > 0

    # Explicit NaN replacement, default replacement and a Series replacement.
    assert_series_equal(ser.where(~positive, np.nan), ser.mask(positive))
    assert_series_equal(ser.where(~positive), ser.mask(positive))
    assert_series_equal(ser.where(~positive, -ser), ser.mask(positive, -ser))

    # Conditions that only cover part of the index.
    flags = Series([True, False, False, True, False], index=ser.index)
    negative = -(ser.abs())

    via_where = negative.where(~flags[:3])
    via_mask = negative.mask(flags[:3])
    assert_series_equal(via_where, via_mask)

    via_where = negative.where(~flags[:3], -negative)
    via_mask = negative.mask(flags[:3], -negative)
    assert_series_equal(via_where, via_mask)

    # Conditions of the wrong shape are rejected.
    with pytest.raises(ValueError):
        ser.mask(1)
    with pytest.raises(ValueError):
        ser.mask(flags[:3].values, -ser)

    # dtype changes
    ints = Series([1, 2, 3, 4])
    observed = ints.mask(ints > 2, np.nan)
    wanted = Series([1, 2, np.nan, np.nan])
    assert_series_equal(observed, wanted)
def global_filter(X: pd.Series, no_change_window: int = 3, min_value: float = None, max_value: float = None, allow_zero: bool = False, allow_negative: bool = False, copy=True) -> pd.Series:
    """Replace implausible samples of a time-indexed series with NaN.

    The filters are applied in this order:

    1. stale data: samples whose absolute changes over the trailing
       ``no_change_window`` hours sum to less than 1e-3 (the first
       ``no_change_window`` hours of samples are never flagged);
    2. values below ``min_value`` / above ``max_value`` when given;
    3. values within float32 epsilon of zero, unless ``allow_zero``;
    4. negative values, unless ``allow_negative``;
    5. values whose magnitude exceeds ten times the magnitude of the median.

    :param X: series to filter; the rolling window is time based, so the
        index is expected to be datetime-like.
    :param no_change_window: length, in hours, of the stale-data window.
    :param min_value: optional lower bound.
    :param max_value: optional upper bound.
    :param allow_zero: keep (near-)zero values.
    :param allow_negative: keep negative values.
    :param copy: work on a copy; when False the input is modified in place.
    :return: the filtered series.
    :raises ValueError: if ``X`` is not a ``pd.Series``.
    """
    if not isinstance(X, pd.Series):
        raise ValueError('Input data is expected of pd.Series type')
    if copy:
        X = X.copy()

    # Number of leading samples that cannot be judged yet because a full
    # window of history is not available for them.
    time_step = X.index.to_series().diff().min()
    steps_per_hour = math.ceil(pd.Timedelta(hours=1) / time_step)
    start = int(no_change_window * steps_per_hour)

    changes = X.diff().abs()
    window = pd.Timedelta(hours=no_change_window)
    stale = (changes.rolling(window).sum() < 1e-3).to_numpy().copy()
    stale[:start] = False
    X[stale] = np.nan

    if min_value is not None:
        X.loc[X < min_value] = np.nan
    if max_value is not None:
        X.loc[X > max_value] = np.nan
    if not allow_zero:
        # Compare magnitudes so that this filter removes zeros only;
        # negative values are the business of ``allow_negative`` below.
        X.loc[X.abs() <= np.finfo(np.float32).eps] = np.nan
    if not allow_negative:
        X.loc[X < 0] = np.nan

    # Use the magnitude of the median: with a negative median the threshold
    # would otherwise be negative and every sample would be discarded.
    median = X.median()
    X.loc[X.abs() > 10 * abs(median)] = np.nan
    return X
def mean_abs_scaling(series: pd.Series, minimum_scale=1e-6):
    """
    Divide a Series by the mean of its absolute values.

    The divisor never drops below ``minimum_scale``.  Returns a tuple of
    the scaled Series and the divisor that was used.
    """
    mean_abs = series.abs().mean()
    # Same choice as max(minimum_scale, mean_abs): the mean wins only when
    # it is strictly larger than the floor.
    scale = mean_abs if mean_abs > minimum_scale else minimum_scale
    return series / scale, scale
def bla(inst: Instrument, position: pd.Series):
    '''
    Ratio of traded blocks to held blocks over a rolling window.

    Both the absolute position changes (after forward-filling) and the
    absolute positions are summed over ``system.n_bday_in_3m`` observations
    and multiplied by 4; the result is the first divided by the second.
    ``inst`` is not used.
    '''
    window = system.n_bday_in_3m
    traded = position.ffill().diff().abs()
    held = position.abs()
    traded_per_year = traded.rolling(window=window).sum() * 4
    held_per_year = held.rolling(window=window).sum() * 4
    return traded_per_year / held_per_year
def signal_scalar(signal: pd.Series, target_abs_forecast=system.target_abs_forecast):
    """
    Rescale a signal by its exponentially weighted mean absolute value.

    :param signal: the input signal to be scaled
    :param target_abs_forecast: scalar that is divided by the EWM mean
        (span ``system.n_bday_in_year``) of ``abs(signal)`` to obtain the
        per-observation scaling factor
    :return: pd.Series, the signal multiplied by the scaling factor
    """
    # time series average
    average_abs = signal.abs().ewm(span=system.n_bday_in_year).mean()
    return (target_abs_forecast / average_abs) * signal
class FramePlotting(object):
    """Benchmark ``DataFrame.plot`` for each plot kind on random data."""

    params = [['line', 'bar', 'area', 'barh', 'hist', 'kde', 'pie', 'scatter', 'hexbin']]
    param_names = ['kind']

    def setup(self, kind):
        """Build the two-column frame; its length depends on the plot kind."""
        if kind in ('bar', 'barh', 'pie'):
            n = 100
        else:
            n = 10000 if kind in ('kde', 'scatter', 'hexbin') else 1000000

        x = Series(np.random.randn(n))
        y = Series(np.random.randn(n))
        # These kinds are fed non-negative data.
        if kind in ('area', 'pie'):
            x, y = x.abs(), y.abs()

        self.x = x
        self.y = y
        self.df = DataFrame({'x': x, 'y': y})

    def time_frame_plot(self, kind):
        """Timed body: plot ``y`` against ``x``."""
        self.df.plot(x='x', y='y', kind=kind)
def _conflict_by_entity(simulation, of_var_holder, ipp_var, pb_calcul, ipp_output = ipp_output):
    """Compare one holder's values with the ``ipp_var`` column of ``ipp_output``.

    Rows whose absolute values differ by more than ``threshold`` count as
    conflicts.  When at least one conflict exists, ``of_var`` is appended to
    ``pb_calcul`` and, if ``verbose`` is set, the conflicting values and the
    related input variables are printed.

    ``threshold``, ``verbose``, ``of_var`` and the default ``ipp_output`` are
    read from the surrounding scope, which is not visible here.
    This is Python 2 code (print statements).

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source; in particular confirm that ``pb_calcul += [of_var]`` belongs to
    the "conflict found" branch and not to the ``verbose`` branch.
    """
    of_var_series = Series(of_var_holder.array)
    entity = of_var_holder.entity
    if entity.is_persons_entity:
        # Person-level variable: keep only the rows whose 'quimen' is 0 or 1.
        quimen_series = Series(simulation.get_holder('quimen').array)
        of_var_series = of_var_series[quimen_series.isin([0, 1])].reset_index(drop = True)
        ipp_var_series = ipp_output[ipp_var]
        # print ipp_var
        # print ipp_var_series
        # print of_var_series
        # print "\n"
    else:
        # Group-level variable: pick one row of ipp_output per group.
        quient_series = Series(simulation.get_holder('qui' + entity.symbol).array)
        quient_0 = quient_series[quient_series == 0]
        quient_1 = quient_series[quient_series == 1]
        long = range(len(quient_0))
        if len(quient_1) > 0:
            # When role 1 is present, every second row is used.
            # NOTE(review): presumably ipp_output then holds two rows per
            # group - confirm.
            long = [2 * x for x in long]
        ipp_var_series = ipp_output.loc[long, ipp_var].reset_index(drop = True)
    # Signs are ignored: only the magnitudes are compared.
    conflict = ((ipp_var_series.abs() - of_var_series.abs()).abs() > threshold)
    idmen = simulation.get_holder('idmen').array
    conflict_selection = DataFrame({'idmen': idmen, 'idfoy': simulation.get_holder('idfoy').array})
    # Distinct 'idmen' / 'idfoy' identifiers of the conflicting rows.
    conflict_men = conflict_selection.loc[conflict[conflict == True].index, 'idmen'].drop_duplicates().values  # noqa
    conflict_foy = conflict_selection.loc[conflict[conflict == True].index, 'idfoy'].drop_duplicates().values  # noqa
    if (len(ipp_var_series[conflict]) != 0):
        if verbose:
            print u"Le calcul de {} pose problème : ".format(of_var)
            print DataFrame({
                "IPP": ipp_var_series[conflict],
                "OF": of_var_series[conflict],
                "diff.": ipp_var_series[conflict].abs() - of_var_series[conflict].abs(),
                }).to_string()
            relevant_variables = _relevant_input_variables(simulation)
            print relevant_variables
            # One frame of relevant input variables per entity key.
            input = {}
            for entity in ['ind', 'men', 'foy']:
                dic = {}
                for variable in relevant_variables[entity]:
                    dic[variable] = simulation.get_holder(variable).array
                input[entity] = DataFrame(dic)
            print "Variables individuelles associées à ce ménage:"
            print input['ind'].loc[input['ind']['idmen'].isin(conflict_men)].to_string()  # .loc[conflict[conflict == True].index].to_string()
            if not input['men'].empty:
                print "Variables associées au ménage:"
                print input['men'].loc[conflict_men].to_string()
            if not input['foy'].empty:
                print "Variables associées au foyer fiscal:"
                print input['foy'].loc[conflict_foy].to_string()
        # Record the variable as problematic (mutates the caller's list).
        pb_calcul += [of_var]
def log_minmax(column: pd.Series):
    """
    Min-max scaling applied to the base-10 logarithm of the absolute values.

    Missing values are treated as 0.  Zeros (whose logarithm is -inf) are
    ignored when the minimum is determined and end up as 0 in the result.

    :param pd.Series column: Input series of numbers.
    :return pd.Series: Log-scaled values divided by the log-range, i.e.
        0 for the smallest non-zero magnitude and 1 for the largest.
    """
    magnitudes = np.log10(column.fillna(0).abs())

    # Smallest finite log-value and overall largest log-value.
    lowest = min(magnitudes[magnitudes != -np.inf])
    highest = max(magnitudes)

    shifted = (magnitudes - lowest).replace(-np.inf, 0)
    return shifted / (highest - lowest)
class FramePlotting:
    """Benchmark ``DataFrame.plot`` for each plot kind on random data."""

    params = [
        ["line", "bar", "area", "barh", "hist", "kde", "pie", "scatter", "hexbin"]
    ]
    param_names = ["kind"]

    def setup(self, kind):
        """Create ``self.x``, ``self.y`` and ``self.df`` sized for ``kind``."""
        size_by_kind = {
            "bar": 100,
            "barh": 100,
            "pie": 100,
            "kde": 10000,
            "scatter": 10000,
            "hexbin": 10000,
        }
        n = size_by_kind.get(kind, 1000000)

        self.x = Series(np.random.randn(n))
        self.y = Series(np.random.randn(n))
        needs_non_negative = kind in ["area", "pie"]
        if needs_non_negative:
            self.x = self.x.abs()
            self.y = self.y.abs()
        self.df = DataFrame({"x": self.x, "y": self.y})

    def time_frame_plot(self, kind):
        """Timed body: plot ``y`` against ``x``."""
        self.df.plot(x="x", y="y", kind=kind)
def _turn_analog_vec_into_per_frame(self, vec: pd.Series) -> pd.Series:
    """
    Squeezes the input running data from a per-row basis to a per frame.

    The absolute values of ``vec`` are averaged over a rolling window of
    ``len(vec) // self.num_of_frames`` rows (at least 1), and the result is
    sampled at ``self.num_of_frames`` evenly spaced row positions, from the
    first complete window to the last row.

    :param vec: per-row data.
    :return: one value per frame, labelled with the index entries of the
        sampled rows.
    """
    window_size = max(len(vec) // self.num_of_frames, 1)
    mean_data = vec.abs().rolling(window_size).mean()
    sample_at = np.linspace(window_size - 1,
                            len(vec) - 1,
                            num=self.num_of_frames,
                            dtype=np.intp)
    # ``sample_at`` holds row positions, so select by position; label-based
    # ``[]`` only gives the same rows for a default 0..n-1 index.
    data_per_frame = mean_data.iloc[sample_at]
    assert len(data_per_frame) == self.num_of_frames
    return data_per_frame
def plot_1d_corr_heatmap(corr: pd.Series, annot=True, fmt='.2f', cmap='coolwarm'):
    """Draw a one-column heatmap of correlations, sorted from highest to lowest.

    The colour scale is centred on 0 and symmetric, bounded by the largest
    absolute correlation.  The figure is 1.5 wide and ``len(corr) // 3.5``
    high.  Nothing is returned.
    """
    bound = corr.abs().max()
    ranked = pd.DataFrame(corr.sort_values(ascending=False))
    figure_height = len(corr) // 3.5
    plt.subplots(figsize=(1.5, figure_height))
    heatmap_options = dict(annot=annot, fmt=fmt, cmap=cmap, center=0,
                           vmin=-bound, vmax=bound)
    sns.heatmap(ranked, **heatmap_options)
def infer_vmin_vmax(data:pd.Series, continuous_type="infer"):
    """Return the ``(vmin, vmax)`` colour limits for ``data``.

    * ``"diverging"``: symmetric limits ``(-m, m)`` with ``m`` the largest
      absolute value.
    * ``"sequential"``: ``(data.min(), data.max())``.
    * ``"infer"`` or ``None``: the type is first determined with
      ``infer_continuous_type(data)``.

    Raises AssertionError when the type is neither of the two, because no
    limits could be computed.  A limit equal to 0 is valid.
    """
    # Infer continuous type
    if continuous_type in ["infer", None]:
        continuous_type = infer_continuous_type(data)

    vmin = None
    vmax = None
    if continuous_type == "diverging":
        # +/-
        vmax = data.abs().max()
        vmin = -vmax
    elif continuous_type == "sequential":
        # Other
        vmax = data.max()
        vmin = data.min()

    # Compare against None explicitly: a truthiness test would reject a
    # legitimate limit of 0.
    if vmin is None or vmax is None:
        raise AssertionError("`vmin` and `vmax` should not be None at this point. Please check `infer_continuous_type`")
    return vmin, vmax
def robust_zscore(x: pd.Series, zscore=False):
    """Robust ZScore Normalization

    Centre on the median, scale by the median absolute deviation and clip:

        mean(x) = median(x)
        std(x) = MAD(x) * 1.4826

    The scaled values are clipped to [-3, 3].  With ``zscore=True`` the
    clipped values are additionally standardised with their own mean and
    standard deviation.

    Reference: https://en.wikipedia.org/wiki/Median_absolute_deviation.
    """
    centered = x - x.median()
    mad = centered.abs().median()
    scaled = np.clip(centered / mad / 1.4826, -3, 3)
    if not zscore:
        return scaled

    scaled -= scaled.mean()
    scaled /= scaled.std()
    return scaled
class SeriesPlotting:
    """Benchmark ``Series.plot`` for each plot kind on random data."""

    params = [["line", "bar", "area", "barh", "hist", "kde", "pie"]]
    param_names = ["kind"]

    def setup(self, kind):
        """Create ``self.s`` with a length that depends on the plot kind."""
        size_by_kind = {"bar": 100, "barh": 100, "pie": 100, "kde": 10000}
        n = size_by_kind.get(kind, 1000000)

        values = Series(np.random.randn(n))
        # These kinds are fed non-negative data.
        self.s = values.abs() if kind in ["area", "pie"] else values

    def time_series_plot(self, kind):
        """Timed body: plot the series."""
        self.s.plot(kind=kind)
class SeriesPlotting(object):
    """Benchmark ``Series.plot`` for each plot kind on random data."""

    params = [['line', 'bar', 'area', 'barh', 'hist', 'kde', 'pie']]
    param_names = ['kind']

    def setup(self, kind):
        """Create ``self.s`` with a length that depends on the plot kind."""
        if kind in ('bar', 'barh', 'pie'):
            n = 100
        else:
            n = 10000 if kind in ('kde',) else 1000000

        data = Series(np.random.randn(n))
        if kind in ('area', 'pie'):
            # These kinds are fed non-negative data.
            data = data.abs()
        self.s = data

    def time_series_plot(self, kind):
        """Timed body: plot the series."""
        self.s.plot(kind=kind)
def test_mask():
    """Check that ``Series.mask(cond)`` mirrors ``Series.where(~cond)``."""
    # compare with tested results in test_where
    ser = Series(np.random.randn(5))
    positive = ser > 0

    # Explicit NaN replacement, default replacement and a Series replacement.
    assert_series_equal(ser.where(~positive, np.nan), ser.mask(positive))
    assert_series_equal(ser.where(~positive), ser.mask(positive))
    assert_series_equal(ser.where(~positive, -ser), ser.mask(positive, -ser))

    # Conditions that only cover part of the index.
    flags = Series([True, False, False, True, False], index=ser.index)
    negative = -(ser.abs())

    via_where = negative.where(~flags[:3])
    via_mask = negative.mask(flags[:3])
    assert_series_equal(via_where, via_mask)

    via_where = negative.where(~flags[:3], -negative)
    via_mask = negative.mask(flags[:3], -negative)
    assert_series_equal(via_where, via_mask)

    # Conditions of the wrong shape are rejected with a specific message.
    msg = "Array conditional must be same shape as self"
    with pytest.raises(ValueError, match=msg):
        ser.mask(1)
    with pytest.raises(ValueError, match=msg):
        ser.mask(flags[:3].values, -ser)

    # dtype changes
    ints = Series([1, 2, 3, 4])
    observed = ints.mask(ints > 2, np.nan)
    wanted = Series([1, 2, np.nan, np.nan])
    assert_series_equal(observed, wanted)

    # see gh-21891
    pair = Series([1, 2])
    observed = pair.mask([True, False])
    wanted = Series([np.nan, 2])
    tm.assert_series_equal(observed, wanted)
def test_mask():
    """Check that ``Series.mask(cond)`` mirrors ``Series.where(~cond)``."""
    # compare with tested results in test_where
    ser = Series(np.random.randn(5))
    positive = ser > 0

    # Explicit NaN replacement, default replacement and a Series replacement.
    tm.assert_series_equal(ser.where(~positive, np.nan), ser.mask(positive))
    tm.assert_series_equal(ser.where(~positive), ser.mask(positive))
    tm.assert_series_equal(ser.where(~positive, -ser), ser.mask(positive, -ser))

    # Conditions that only cover part of the index.
    flags = Series([True, False, False, True, False], index=ser.index)
    negative = -(ser.abs())

    via_where = negative.where(~flags[:3])
    via_mask = negative.mask(flags[:3])
    tm.assert_series_equal(via_where, via_mask)

    via_where = negative.where(~flags[:3], -negative)
    via_mask = negative.mask(flags[:3], -negative)
    tm.assert_series_equal(via_where, via_mask)

    # Conditions of the wrong shape are rejected with a specific message.
    msg = "Array conditional must be same shape as self"
    with pytest.raises(ValueError, match=msg):
        ser.mask(1)
    with pytest.raises(ValueError, match=msg):
        ser.mask(flags[:3].values, -ser)

    # dtype changes
    ints = Series([1, 2, 3, 4])
    observed = ints.mask(ints > 2, np.nan)
    wanted = Series([1, 2, np.nan, np.nan])
    tm.assert_series_equal(observed, wanted)

    # see gh-21891
    pair = Series([1, 2])
    observed = pair.mask([True, False])
    wanted = Series([np.nan, 2])
    tm.assert_series_equal(observed, wanted)
def scale_series(series: pd.Series, kwargs: Dict[str, Any]) -> Tuple[pd.Series, int, str]:
    """Scale a series by a power of 1000 when scaling is requested in kwargs.

    Scaling is requested with 'scale_y'=True or 'scale_x'=True.  The factor
    is the exponent in ``10 ** factor``: 0 when the largest absolute value
    is below 1000, otherwise the largest multiple of 3 not above its
    base-10 logarithm.  When the factor is positive, the matching axis
    label in kwargs ('xlabel' or 'ylabel') is created or extended with the
    scale name; kwargs is modified in place.

    Returns a tuple: adjusted series, factor, factor-text."""
    # do we need to act
    scale_y = get_selected_item(kwargs, 'scale_y', default=False)
    scale_x = get_selected_item(kwargs, 'scale_x', default=False)
    if not (scale_x or scale_y):
        return series, 0, ''

    label = 'xlabel' if scale_x else 'ylabel'

    peak = series.abs().max()
    if peak < 1000:
        factor = 0
    else:
        factor = np.floor(np.log10(peak) / 3.0) * 3

    if factor > 0:
        if label in kwargs:
            kwargs[label] = f'{kwargs[label]} ({SCALES[factor]})'
        else:
            kwargs[label] = f'{SCALES[factor].title()}'

    scaled = series / (10**factor)
    return (scaled, factor, SCALES[factor])
def cat_bar_plot(self, data: pd.Series, **kwargs) -> plt.Figure:
    """Bar-plot the absolute values of ``data`` against its index.

    ``kwargs`` are forwarded to ``plt.subplots``.  The x axis is formatted
    with month names and the y axis runs from 0 to the data maximum rounded
    up to the next multiple of 100.  A Series gives one bar per index entry
    of width 6; for a DataFrame every column is drawn with that width
    divided by the number of columns.  Returns the created figure.
    """
    data = data.abs()
    fig, ax = plt.subplots(**kwargs)
    ax.grid(True)
    ax.xaxis.set_major_formatter(dates.DateFormatter('%B'))

    total_w = 6
    if isinstance(data, pd.Series):
        upper = ceil(data.max() / 100) * 100
        ax.set_ylim(0, upper)
        ax.bar(data.index, data.values, width=total_w)
        return fig

    if isinstance(data, pd.DataFrame):
        upper = ceil(data.max().max() / 100) * 100
        ax.set_ylim(0, upper)
        for column in data:
            column_width = total_w / data.shape[1]
            # The columns are currently not shifted against each other.
            shift = timedelta(days=int(0))
            ax.bar(data.index + shift, data[column].values, width=column_width)
    return fig
def GSEA2005(geneset_membership: pd.Series, correlations: pd.Series):
    '''
    Implementation of algorithm described here:
    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1239896/

    Genes are ranked by descending absolute correlation and a running sum
    is accumulated along that ranking: hits add their share of the summed
    absolute correlation of the set, misses subtract ``1 / (N - N_H)``.

    :param geneset_membership: (pd.Series) True if in set, False if not,
        index: all genes
    :param correlations: (pd.Series) Correlation of a given gene
    :return (Tuple[np.array, np.array]) x and y arrays ready to be plotted.
        ES = y.max()

    NOTE(review): a set with no hits or only hits makes a denominator zero;
    this case is not handled.
    '''
    # r_j: correlation of gene_j in ranked order
    r_j = correlations.abs().sort_values(ascending=False)
    # S: geneset mask in the same (ranked) order as r_j.  Using the ranked
    # index matters: with a differently ordered index the arithmetic below
    # would realign by label and the running sum would no longer follow the
    # ranking.
    S = geneset_membership.loc[r_j.index]
    N = S.count()  # N: number of genes
    N_H = S.sum()  # N_H: number of hits
    N_R = r_j[S].sum()  # N_R: sum of r_j for g_j \in S
    P_hit = S * r_j / N_R  # P_hit: fraction of hits weighted by r_j
    P_miss = (~S) * 1 / (N - N_H)  # P_miss: fraction of misses up to position i

    # 0 added to beginning for plotting, doesn't affect sum
    x = np.arange(N + 1)
    y = np.concatenate([[0], np.cumsum((P_hit - P_miss).to_numpy())])
    return x, y
def ccf(x, y, lags=None, bin_method='gaussian', bin_width=None,
        max_gap=np.inf, min_obs=10, output="ccf"):
    """Method to calculate the cross-correlation function for irregular
    timesteps based on the slotting technique. Different methods (kernels)
    to bin the data are available.

    Parameters
    ----------
    x, y: pandas.Series
        Pandas Series containing the values to calculate the
        cross-correlation for. The index has to be a Pandas.DatetimeIndex
    lags: numpy.array, optional
        numpy array containing the lags in days for which the
        cross-correlation is calculated. Defaults to [0, 1, 2, 3, 4, 5, 6,
        7, 8, 9, 10, 12, 13, 14, 30, 61, 90, 120, 150, 180, 210, 240, 270,
        300, 330, 365]. Lags smaller than the smallest time step that occurs
        at least `min_obs` times are dropped (lag 0 is always kept).
    bin_method: str, optional
        method to determine the type of bin. Options are "gaussian"
        (default), "sinc" and "rectangle".
    bin_width: float, optional
        number of days used as the width for the bin to calculate the
        correlation. A number is applied to every lag. By default these
        values are chosen based on the bin_method and the average time step.
    max_gap: float, optional
        Maximum timestep gap in the data. All timesteps above this gap value
        are not used for calculating the average timestep. This can be
        helpful when there is a large gap in the data that influences the
        average timestep.
    min_obs: int, optional
        Number of observations a time step (cumulated over the sorted
        distinct steps) needs before it is taken as the minimum time step.
    output: str, optional
        "ccf" (default) returns only the correlations, "full" additionally
        returns a "stderr" column.

    Returns
    -------
    CCF: pandas.Series or pandas.DataFrame
        The Cross-correlation function, normalized by its largest absolute
        value and indexed by lag in days. A DataFrame with columns "CCF" and
        "stderr" when output="full".

    Raises
    ------
    NotImplementedError
        If bin_method is not one of the supported kernels.

    References
    ----------
    Rehfeld, K., Marwan, N., Heitzig, J., Kurths, J. (2011). Comparison
    of correlation analysis techniques for irregularly sampled time series.
    Nonlinear Processes in Geophysics. 18. 389-404. 10.5194 pg-18-389-2011.

    Examples
    --------
    acf = ps.stats.ccf(x, y, bin_method="gaussian")

    """
    # prepare the time indices for x and y
    dt_x = x.index.to_series().diff().values / Timedelta(1, "D")
    dt_x[0] = 0.0
    dt_x_mu = dt_x[dt_x < max_gap].mean()  # Deal with big gaps if present
    t_x = np.cumsum(dt_x)

    dt_y = y.index.to_series().diff().values / Timedelta(1, "D")
    dt_y[0] = 0.0
    dt_y_mu = dt_y[dt_y < max_gap].mean()
    t_y = np.cumsum(dt_y)

    dt_mu = max(dt_x_mu, dt_y_mu)

    # Create matrix with time differences
    t1, t2 = np.meshgrid(t_x, t_y)
    t = np.abs(np.subtract(t1, t2))  # absolute values

    # Normalize the values and create numpy arrays
    x = (x.values - x.values.mean()) / x.values.std()
    y = (y.values - y.values.mean()) / y.values.std()

    # Create matrix for covariances
    xy = np.outer(y, x)

    if lags is None:
        # Default lags in Days, log-scale between 0 and 365.
        lags = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 30, 61, 90, 120,
                150, 180, 210, 240, 270, 300, 330, 365]

    # Remove lags that cannot be determined because lag < dt_min
    u, i = np.unique(dt_x, return_counts=True)
    dt_x_min = u[(Series(i, u).cumsum() >= min_obs).values][0]
    u, i = np.unique(dt_y, return_counts=True)
    dt_y_min = u[(Series(i, u).cumsum() >= min_obs).values][0]
    dt_min = min(dt_x_min, dt_y_min)
    # dt_min = min(dt_x[1:].min(), dt_y[1:].min())

    lags = np.array([float(lag) for lag in lags if lag >= dt_min or lag == 0])

    # Delete to free memory
    del (x, y, dt_x, dt_y, t1, t2, t_x, t_y)

    # Select appropriate bin_width, default depend on bin_method
    if bin_width is None:
        options = {"rectangle": 0.5, "sinc": 1, "gaussian": 0.25}
        bin_width = np.ones_like(lags) * options[bin_method] * dt_mu
    elif (isinstance(bin_width, (int, float, np.integer, np.floating))
          and not isinstance(bin_width, bool)):
        # A single number is the width for every lag.
        bin_width = np.ones_like(lags) * bin_width
    else:
        # NOTE(review): any other value is replaced by this fixed table,
        # which has one entry per default lag - confirm that this is
        # intended for user-supplied sequences.
        bin_width = [0.5, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 5,
                     2, 2, 2, 2, 2, 2, 2, 2]

    # Select the binning method to calculate the cross-correlation
    if bin_method == "rectangle":
        a = np.zeros_like(t, dtype=float)
        kernel_func = lambda d, h: np.less_equal(np.abs(d, out=a), h,
                                                 out=a).astype(int)
    elif bin_method == "gaussian":
        a = np.zeros_like(t, dtype=float)

        def kernel_func(d, h):
            den1 = -2 * h ** 2  # denominator 1
            den2 = np.sqrt(2 * np.pi * h)  # denominator 2
            return np.exp(np.square(d, out=a) / den1, out=a) / den2
    elif bin_method == "sinc":
        # np.sinc(z) is sin(pi * z) / (pi * z) with the value 1 at z == 0,
        # which avoids the 0 / 0 of the explicit quotient.
        kernel_func = lambda d, h: np.sinc(h * d)
    else:
        raise NotImplementedError("bin_method %s is not implemented." %
                                  bin_method)

    # Pre-allocate an array to speed up all numpy methods
    UDCF = np.zeros_like(lags, dtype=float)
    M = np.zeros_like(lags, dtype=float)
    d = np.zeros_like(t, dtype=float)

    for i, k in enumerate(lags):
        # Construct the kernel for the lag
        np.subtract(t, k, out=d)
        h = bin_width[i]
        b = kernel_func(d, h)
        c = np.multiply(xy, b, out=d)  # Element-wise multiplication
        UDCF[i] = np.sum(c)
        M[i] = np.sum(b)

    DCF = UDCF / M

    C = Series(data=DCF, index=lags, name="CCF")
    CCF = C / C.abs().max()

    if output == "full":
        CCFstd = np.sqrt((np.cumsum(UDCF) - M * DCF) ** 2) / (M - 1)
        CCF = DataFrame(data={"CCF": CCF.values, "stderr": CCFstd},
                        index=lags)

    CCF.index.name = "Lags (Days)"
    return CCF
def round(series: pd.Series, decimals: int = 0) -> pd.Series:
    """Round a series; without decimals, halves are rounded away from zero.

    With ``decimals`` equal to 0 the result is an int64 series.  Otherwise
    the call is delegated to ``Series.round``.
    """
    if decimals:
        return series.round(decimals=decimals)

    sign = np.sign(series)
    # ceil(-|x| - 0.5) equals -(|x| rounded half up); the outer negation of
    # the sign restores the original sign.
    negated_magnitude = np.ceil(-(series.abs()) - 0.5)
    return (-(sign) * negated_magnitude).astype(np.int64)
# NOTE(review): script fragment (Python 2). It starts inside a loop whose
# header is not part of this excerpt (``m`` and ``m_coef`` are bound there)
# and its line breaks and indentation have been lost, so the statement text
# is left exactly as found. For each experiment it fits an ElasticNet on the
# conditions that do not start with the experiment name and stores the
# absolute coefficients; the per-feature mean over the experiments goes into
# ``lm_res[m]``, and ``info_table`` lists the (feature, metabolite, coef)
# triples of ``lm_res``.
iteration = 0 for exp in experiments: train = [c for c in conditions if not c.startswith(exp)] test = [c for c in conditions if c.startswith(exp)] yss, xss = ys.ix[m, train], xs[train].T lm = ElasticNet(alpha=0.01).fit(xss, yss) pred, meas = lm.predict(xs[test].T), ys.ix[m, test] coefs = Series(lm.coef_, index=xss.columns) cor, pval = pearsonr(list(pred), list(meas)) m_coef['%d' % iteration] = coefs.abs().to_dict() iteration += 1 lm_res[m] = DataFrame(m_coef).mean(1).to_dict() print '[INFO] Associations performed' # Conditions betas info_table = DataFrame([(f, m, lm_res[m][f]) for m in lm_res for f in lm_res[m]], columns=['feature', 'metabolite', 'coef']) # # # info_table = info_table[[i in db_proteins for i in info_table['feature']]] # info_table = info_table[[i in db_ions for i in info_table['metabolite']]]
def _zero_nans(out: pd.Series, a: pd.Series) -> pd.Series:
    """Set ``out`` to 0.0 where it is NaN and ``a`` is (almost) zero.

    ``out`` is modified in place and also returned.  Entries of ``out``
    that are NaN while ``abs(a)`` is 1e-16 or more are left untouched.
    """
    # nan when 0 / 0, a > 0 /0 => inf
    zero_over_zero = (a.abs() < 1e-16) & pd.isnull(out)  # type: ignore
    out[zero_over_zero] = 0.0  # type: ignore
    return out
def features(x: pd.Series) -> pd.DataFrame:
    """Summarise a series as a single-row frame of statistical features.

    The row is labelled 1.  It holds, in this order: basic statistics and
    quantiles of the raw values, statistics of the absolute values, two
    trend features from ``Base._add_trend_feature`` and, for the window
    sizes 10, 50, 100 and 1000, statistics of the rolling standard
    deviation, the rolling mean and the rolling mean of absolute values.
    """
    raw = x.values
    df = pd.DataFrame(dtype=np.float64)

    df.loc[1, 'ave'] = raw.mean()
    df.loc[1, 'std'] = raw.std()
    df.loc[1, 'max'] = raw.max()
    df.loc[1, 'min'] = raw.min()
    base_quantiles = (('q90', 0.90), ('q95', 0.95), ('q99', 0.99),
                      ('q05', 0.05), ('q10', 0.10), ('q01', 0.01))
    for label, level in base_quantiles:
        df.loc[1, label] = np.quantile(raw, level)

    magnitude = np.abs(raw)
    df.loc[1, 'abs_max'] = magnitude.max()
    df.loc[1, 'abs_mean'] = magnitude.mean()
    df.loc[1, 'abs_std'] = magnitude.std()

    df.loc[1, 'trend'] = Base._add_trend_feature(raw)
    df.loc[1, 'abs_trend'] = Base._add_trend_feature(raw, abs_values=True)

    # New features - rolling features
    # Only the rolling std additionally reports the 10% quantile.
    std_quantiles = (('q01', 0.01), ('q05', 0.05), ('q10', 0.10),
                     ('q95', 0.95), ('q99', 0.99))
    tail_quantiles = (('q01', 0.01), ('q05', 0.05),
                      ('q95', 0.95), ('q99', 0.99))

    for w in [10, 50, 100, 1000]:
        rolled = (
            ('roll_std', x.rolling(w).std().dropna().values, std_quantiles),
            ('roll_mean', x.rolling(w).mean().dropna().values, tail_quantiles),
            ('roll_abs_mean', x.abs().rolling(w).mean().dropna().values,
             tail_quantiles),
        )
        for kind, stats, quantiles in rolled:
            suffix = '_' + kind + '_' + str(w)
            df.loc[1, 'ave' + suffix] = stats.mean()
            df.loc[1, 'std' + suffix] = stats.std()
            df.loc[1, 'max' + suffix] = stats.max()
            df.loc[1, 'min' + suffix] = stats.min()
            for label, level in quantiles:
                df.loc[1, label + suffix] = np.quantile(stats, level)

    return df
class MySeries:
    """Thin wrapper around a pandas Series whose methods return MySeries.

    The constructor arguments are forwarded to ``Series``.  The wrapped
    series is available as ``x``; its ``values`` and ``index`` are exposed
    as attributes.  All other methods forward their arguments to the
    wrapped series and wrap the result again.

    The ``rolling_*`` methods use ``Series.rolling`` when it is available
    and fall back to the legacy top-level ``pd.rolling_*`` functions
    otherwise, so they work on both old and current pandas.  Only
    arguments that both APIs understand (``window``, ``min_periods`` and
    ``center=`` as keyword) are portable.

    NOTE(review): ``nonzero``, ``clip_lower`` and ``clip_upper`` forward to
    Series methods that newer pandas releases no longer provide - confirm
    against the pandas version in use.
    """

    def __init__(self, *args, **kwargs):
        self.x = Series(*args, **kwargs)
        self.values = self.x.values
        self.index = self.x.index

    def _rolling(self, stat, *args, **kwargs):
        """Apply the moving-window statistic ``stat`` (e.g. 'mean', 'kurt')."""
        if hasattr(self.x, 'rolling'):
            result = getattr(self.x.rolling(*args, **kwargs), stat)()
        else:
            # Legacy pandas: module-level functions named rolling_<stat>.
            result = getattr(pd, 'rolling_' + stat)(self.x, *args, **kwargs)
        return MySeries(result)

    def rolling_mean(self, *args, **kwargs):
        return self._rolling('mean', *args, **kwargs)

    def rolling_count(self, *args, **kwargs):
        return self._rolling('count', *args, **kwargs)

    def rolling_sum(self, *args, **kwargs):
        return self._rolling('sum', *args, **kwargs)

    def rolling_median(self, *args, **kwargs):
        return self._rolling('median', *args, **kwargs)

    def rolling_min(self, *args, **kwargs):
        return self._rolling('min', *args, **kwargs)

    def rolling_max(self, *args, **kwargs):
        return self._rolling('max', *args, **kwargs)

    def rolling_std(self, *args, **kwargs):
        return self._rolling('std', *args, **kwargs)

    def rolling_var(self, *args, **kwargs):
        return self._rolling('var', *args, **kwargs)

    def rolling_skew(self, *args, **kwargs):
        return self._rolling('skew', *args, **kwargs)

    def rolling_kurtosis(self, *args, **kwargs):
        # pandas names this statistic 'kurt' in both the legacy and the
        # current API.
        return self._rolling('kurt', *args, **kwargs)

    def rolling_window(self, *args, **kwargs):
        """Weighted moving window (mean by default, sum with ``mean=False``)."""
        if not hasattr(self.x, 'rolling'):
            return MySeries(pd.rolling_window(self.x, *args, **kwargs))
        # The legacy function took (window, win_type, min_periods)
        # positionally; Series.rolling orders its parameters differently,
        # so pass them by name.
        legacy_order = ('window', 'win_type', 'min_periods')
        if len(args) > len(legacy_order):
            raise TypeError('rolling_window accepts at most %d positional '
                            'arguments' % len(legacy_order))
        options = dict(zip(legacy_order, args))
        options.update(kwargs)
        use_mean = options.pop('mean', True)
        roller = self.x.rolling(**options)
        return MySeries(roller.mean() if use_mean else roller.sum())

    def cumprod(self, *args, **kwargs):
        return MySeries(self.x.cumprod(*args, **kwargs))

    def cumsum(self, *args, **kwargs):
        return MySeries(self.x.cumsum(*args, **kwargs))

    def diff(self, *args, **kwargs):
        return MySeries(self.x.diff(*args, **kwargs))

    def div(self, *args, **kwargs):
        return MySeries(self.x.div(*args, **kwargs))

    def mul(self, *args, **kwargs):
        return MySeries(self.x.mul(*args, **kwargs))

    def add(self, *args, **kwargs):
        return MySeries(self.x.add(*args, **kwargs))

    def dropna(self, *args, **kwargs):
        return MySeries(self.x.dropna(*args, **kwargs))

    def fillna(self, *args, **kwargs):
        return MySeries(self.x.fillna(*args, **kwargs))

    def floordiv(self, *args, **kwargs):
        return MySeries(self.x.floordiv(*args, **kwargs))

    def mod(self, *args, **kwargs):
        return MySeries(self.x.mod(*args, **kwargs))

    def nlargest(self, *args, **kwargs):
        return MySeries(self.x.nlargest(*args, **kwargs))

    def nonzero(self, *args, **kwargs):
        return MySeries(self.x.nonzero(*args, **kwargs))

    def nsmallest(self, *args, **kwargs):
        return MySeries(self.x.nsmallest(*args, **kwargs))

    def pow(self, *args, **kwargs):
        return MySeries(self.x.pow(*args, **kwargs))

    def rank(self, *args, **kwargs):
        return MySeries(self.x.rank(*args, **kwargs))

    def round(self, *args, **kwargs):
        return MySeries(self.x.round(*args, **kwargs))

    def shift(self, *args, **kwargs):
        return MySeries(self.x.shift(*args, **kwargs))

    def sub(self, *args, **kwargs):
        return MySeries(self.x.sub(*args, **kwargs))

    def abs(self, *args, **kwargs):
        return MySeries(self.x.abs(*args, **kwargs))

    def clip(self, *args, **kwargs):
        return MySeries(self.x.clip(*args, **kwargs))

    def clip_lower(self, *args, **kwargs):
        return MySeries(self.x.clip_lower(*args, **kwargs))

    def clip_upper(self, *args, **kwargs):
        return MySeries(self.x.clip_upper(*args, **kwargs))

    def interpolate(self, *args, **kwargs):
        return MySeries(self.x.interpolate(*args, **kwargs))

    def resample(self, *args, **kwargs):
        return MySeries(self.x.resample(*args, **kwargs))

    def replace(self, *args, **kwargs):
        return MySeries(self.x.replace(*args, **kwargs))
def turnover(series:pd.Series):
    """Absolute change relative to the rolling mean absolute level.

    The absolute difference between consecutive values is divided by the
    mean of the absolute values over ``system.n_bday_in_3m`` observations
    and multiplied by ``system.n_bday_in_year``.
    """
    absolute_change = series.diff().abs()
    typical_level = series.abs().rolling(window=system.n_bday_in_3m).mean()
    return absolute_change / typical_level * system.n_bday_in_year
# array([0, 5, 6, 0, 0]) trunc(fser100) # array([0, 1, 2, 3, 4, 5]) trunc(fser200) # array([90, 0, 67, 12, 0, 79], dtype=int64) trunc(fser300) # array([50, 24, 56, 89, 33], dtype=int64) # now numpy array, the index is lost # Statistics; fser300.mean() fser300.std() fser300.max() fser300.idxmax() fser300.cumsum() fser200.abs() # Using python functions; fser300.apply(lambda x: x if x > 40 else 40) fser600 = Series(["Baby", "Girl", "Boy", "Woman", "Man"], index=["Ba", "Gi", "Bo", "Wo", "Ma"]) myobj = {'Baby': 10, 'Girl': 22, 'Boy': 45, 'Woman': 90, 'Man': 89} fser700 = Series(myobj) fser600.map(myobj) fser600.map(fser700) fser300.map(lambda x: x if x > 40 else 40)
def normalize(values: _pd.Series) -> _pd.Series:
    '''
    Rescales a series with the smallest and largest absolute value and
    returns the result as a new Series object.

    Every value has the smallest absolute value subtracted and is divided
    by the spread between the largest and the smallest absolute value.

    NOTE(review): the intended target range is -1.0 to 1.0, but negative
    inputs can fall below -1.0 with this formula - confirm the intent.
    '''
    magnitudes = values.abs()
    smallest = magnitudes.min()
    largest = values.abs().max()
    spread = largest - smallest
    return (values - smallest) / spread
def _get_finite_bounds(numbers: Series) -> Tuple[float, float]:
    """Return the minimum and maximum of ``numbers``, ignoring +/- infinity."""
    infinity = float("inf")
    is_bounded = numbers.abs() != infinity
    bounded = numbers[is_bounded]
    lowest = bounded.min()
    highest = bounded.max()
    return lowest, highest