def _clean_description(self, element):
    '''cleans up aggregated descriptions'''
    data = [x.getText() for x in element]
    data = Series(data)
    data = data.apply(lambda x: re.sub(r'\n', ' ', x))
    mask = data.apply(lambda x: False if re.search(
        r'\(sanded|\(sealed|\(endgrain|\(curl|\(burl|^$', x) else True)
    data = data[mask]

    def func(item):
        try:
            return list(re.search('(.*?):(.*)', item).groups())
        except AttributeError:  # no "heading: content" match
            return [item, None]

    data = data.apply(func).tolist()
    data = DataFrame(data, columns=['heading', 'content'])
    mask = data.content.apply(lambda x: pd.notnull(x))
    if mask.shape[0] > 0:
        mask.iloc[0] = True  # .ix is deprecated; use positional .iloc
    data = data[mask]
    return data
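# A self-contained sketch of the "heading: content" split used above, with
# made-up sample strings (the scraping context is not needed):
import re
import pandas as pd

raw = pd.Series(['Color: pale yellow', 'Grain: straight', 'no colon here'])

def split_heading(item):
    match = re.search('(.*?):(.*)', item)
    return list(match.groups()) if match else [item, None]

parts = pd.DataFrame(raw.apply(split_heading).tolist(),
                     columns=['heading', 'content'])
print(parts)  # rows without a colon get content=None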
def test_apply(self, datetime_series):
    with np.errstate(all='ignore'):
        tm.assert_series_equal(datetime_series.apply(np.sqrt),
                               np.sqrt(datetime_series))

        # element-wise apply
        import math
        tm.assert_series_equal(datetime_series.apply(math.exp),
                               np.exp(datetime_series))

    # empty series
    s = Series(dtype=object, name='foo', index=pd.Index([], name='bar'))
    rs = s.apply(lambda x: x)
    tm.assert_series_equal(s, rs)

    # check all metadata (GH 9322)
    assert s is not rs
    assert s.index is rs.index
    assert s.dtype == rs.dtype
    assert s.name == rs.name

    # index but no data
    s = Series(index=[1, 2, 3])
    rs = s.apply(lambda x: x)
    tm.assert_series_equal(s, rs)
def test_apply(self):
    assert_series_equal(self.ts.apply(np.sqrt), np.sqrt(self.ts))

    # elementwise-apply
    import math
    assert_series_equal(self.ts.apply(math.exp), np.exp(self.ts))

    # how to handle Series result, #2316
    result = self.ts.apply(lambda x: Series([x, x ** 2],
                                            index=['x', 'x^2']))
    expected = DataFrame({'x': self.ts, 'x^2': self.ts ** 2})
    tm.assert_frame_equal(result, expected)

    # empty series
    s = Series(dtype=object, name='foo', index=pd.Index([], name='bar'))
    rs = s.apply(lambda x: x)
    tm.assert_series_equal(s, rs)

    # check all metadata (GH 9322)
    self.assertIsNot(s, rs)
    self.assertIs(s.index, rs.index)
    self.assertEqual(s.dtype, rs.dtype)
    self.assertEqual(s.name, rs.name)

    # index but no data
    s = Series(index=[1, 2, 3])
    rs = s.apply(lambda x: x)
    tm.assert_series_equal(s, rs)
def test_series_map_box_timestamps(self):
    # GH#2689, GH#2627
    ser = Series(pd.date_range('1/1/2000', periods=10))

    def func(x):
        return (x.hour, x.day, x.month)

    # it works!
    ser.map(func)
    ser.apply(func)
def test_series_map_box_timedelta(self):
    # GH 11349
    s = Series(timedelta_range('1 day 1 s', periods=5, freq='h'))

    def f(x):
        return x.total_seconds()

    s.map(f)
    s.apply(f)
    DataFrame(s).applymap(f)
def test_apply_same_length_inference_bug(self):
    s = Series([1, 2])
    f = lambda x: (x, x + 1)

    result = s.apply(f)
    expected = s.map(f)
    assert_series_equal(result, expected)

    s = Series([1, 2, 3])
    result = s.apply(f)
    expected = s.map(f)
    assert_series_equal(result, expected)
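# The inference the test above guards against, in isolation: a function that
# returns a tuple per element must yield an object Series matching map(),
# not a broadcasted frame-like result.
import pandas as pd

s = pd.Series([1, 2, 3])
out = s.apply(lambda x: (x, x + 1))
assert out.tolist() == [(1, 2), (2, 3), (3, 4)]
assert out.equals(s.map(lambda x: (x, x + 1)))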
def test_filter_against_workaround():
    np.random.seed(0)

    # Series of ints
    s = Series(np.random.randint(0, 100, 1000))
    grouper = s.apply(lambda x: np.round(x, -1))
    grouped = s.groupby(grouper)
    f = lambda x: x.mean() > 10

    old_way = s[grouped.transform(f).astype('bool')]
    new_way = grouped.filter(f)
    tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())

    # Series of floats
    s = 100 * Series(np.random.random(1000))
    grouper = s.apply(lambda x: np.round(x, -1))
    grouped = s.groupby(grouper)
    f = lambda x: x.mean() > 10
    old_way = s[grouped.transform(f).astype('bool')]
    new_way = grouped.filter(f)
    tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())

    # Set up DataFrame of ints, floats, strings.
    from string import ascii_lowercase
    letters = np.array(list(ascii_lowercase))
    N = 1000
    random_letters = letters.take(np.random.randint(0, 26, N))
    df = DataFrame({'ints': Series(np.random.randint(0, 100, N)),
                    'floats': N / 10 * Series(np.random.random(N)),
                    'letters': Series(random_letters)})

    # Group by ints; filter on floats.
    grouped = df.groupby('ints')
    old_way = df[grouped.floats
                 .transform(lambda x: x.mean() > N / 20).astype('bool')]
    new_way = grouped.filter(lambda x: x['floats'].mean() > N / 20)
    tm.assert_frame_equal(new_way, old_way)

    # Group by floats (rounded); filter on strings.
    grouper = df.floats.apply(lambda x: np.round(x, -1))
    grouped = df.groupby(grouper)
    old_way = df[grouped.letters
                 .transform(lambda x: len(x) < N / 10).astype('bool')]
    new_way = grouped.filter(lambda x: len(x.letters) < N / 10)
    tm.assert_frame_equal(new_way, old_way)

    # Group by strings; filter on ints.
    grouped = df.groupby('letters')
    old_way = df[grouped.ints
                 .transform(lambda x: x.mean() > N / 20).astype('bool')]
    new_way = grouped.filter(lambda x: x['ints'].mean() > N / 20)
    tm.assert_frame_equal(new_way, old_way)
def normalize_lengths(iterable, shift_right=False):
    '''
    Given an iterable of sequences as strings, returns a pandas Series of
    sequences with Ns added to the end to make them all the same length.

    If shift_right, adds Ns to the beginning instead of the end.
    '''
    series = Series(iterable)
    max_length = series.apply(len).max()

    def normalize_seq(seq):
        ns = 'N' * (max_length - len(seq))
        if shift_right:
            return ns + seq
        else:
            return seq + ns

    return series.apply(normalize_seq)
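# Hypothetical usage of normalize_lengths as defined above (sample sequences
# are made up):
seqs = ['ACGT', 'AC', 'ACGTT']
print(normalize_lengths(seqs).tolist())                    # ['ACGTN', 'ACNNN', 'ACGTT']
print(normalize_lengths(seqs, shift_right=True).tolist())  # ['NACGT', 'NNNAC', 'ACGTT']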
def select_known_events(gauge_dict, base, precipitation, precip=True,
                        date='2006-01-01', buffer_days=25):
    s = data[base]['Q_cfs']
    name = list(gauge_dict.keys())[0]  # dict.keys() is not indexable in Python 3
    date_obj = to_datetime(date)
    rng = s[date_obj - Timedelta(days=buffer_days):
            date_obj + Timedelta(days=buffer_days)]
    if precip:
        fig, ax1 = plt.subplots()
        ax1.plot(rng, 'k', label='Discharge [cfs]')
        ax1.legend()
        plt.title('Discharge at {} Event Peak: {}'.format(name, date))
        ppt_s = Series(array(precipitation)[:, 1],
                       index=array(precipitation)[:, 0])
        ppt_s = ppt_s.reindex(index=rng.index, method=None)
        ppt_s = ppt_s.apply(to_numeric)
        ppt_s[ppt_s < 0] = 0.0
        ppt_s = ppt_s[date_obj - Timedelta(days=buffer_days):
                      date_obj + Timedelta(days=buffer_days)]
        ax1.set_xlabel('Date')
        ax1.set_ylabel('[cfs]')
        for tl in ax1.get_yticklabels():
            tl.set_color('k')
        ax2 = ax1.twinx()
        ax2.bar(ppt_s.index, ppt_s, width=0.1, label='Precipitation [mm/hr]')
        plt.gca().invert_yaxis()
        ax2.set_ylabel('[mm]')
        for tl in ax2.get_yticklabels():
            tl.set_color('b')
        ax2.legend()
    if not precip:
        plt.plot(rng, 'k')
        plt.legend()
        plt.title('Discharge at {} Event Peak: {}'.format(name, date))
        plt.xlabel('Date')
        plt.ylabel('[cfs]')
def test_end_time_timevalues(self, input_vals):
    # GH 17157
    # Check that the time part of the Period is adjusted by end_time
    # when using the dt accessor on a Series
    input_vals = PeriodArray._from_sequence(np.asarray(input_vals))

    s = Series(input_vals)
    result = s.dt.end_time
    expected = s.apply(lambda x: x.end_time)
    tm.assert_series_equal(result, expected)
def test_date_tz(self):
    # GH11757
    rng = pd.DatetimeIndex(['2014-04-04 23:56',
                            '2014-07-18 21:24',
                            '2015-11-22 22:14'], tz="US/Eastern")
    s = Series(rng)
    expected = Series([date(2014, 4, 4),
                       date(2014, 7, 18),
                       date(2015, 11, 22)])
    assert_series_equal(s.dt.date, expected)
    assert_series_equal(s.apply(lambda x: x.date()), expected)
def rolling_mean(data, window, min_periods=1, center=False):
    '''
    Function that computes a rolling mean

    Parameters
    ----------
    data : DataFrame or Series
        If a DataFrame is passed, the rolling_mean is computed for all
        columns.
    window : int or string
        If an int is passed, window is the number of observations used for
        calculating the statistic. If a string is passed, it must be a
        frequency string, e.g. '90S'. This is internally converted into a
        DateOffset object, representing the window size.
    min_periods : int
        Minimum number of observations in window required to have a value.

    Returns
    -------
    Series or DataFrame, if more than one column
    '''
    if len(data) < 2:
        return data

    def f(x):
        '''Function to apply that actually computes the rolling mean'''
        offset = pd.tseries.frequencies.to_offset(window)
        if not center:
            # adding a microsecond because when slicing with labels, start
            # and endpoint are inclusive
            dslice = col[x - offset.delta + timedelta(0, 0, 1):x]
        else:
            dslice = col[x - offset.delta / 2 + timedelta(0, 0, 1):
                         x + offset.delta / 2]
        if dslice.size < min_periods:
            return np.nan
        else:
            return dslice.mean()

    data = DataFrame(data.copy())
    dfout = DataFrame()
    if isinstance(window, int):
        # pd.rolling_mean was removed from pandas; use the .rolling API
        dfout = data.rolling(window, min_periods=min_periods,
                             center=center).mean()
    elif isinstance(window, str):  # basestring is Python 2 only
        idx = Series(pd.to_datetime(data.index), index=data.index)
        for colname, col in data.items():  # iterkv was removed from pandas
            result = idx.apply(f)
            result.name = colname
            dfout = dfout.join(result, how='outer')
    if dfout.columns.size == 1:
        dfout = dfout.iloc[:, 0]  # .ix is deprecated
    return dfout
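# Since pandas 0.19, time-based windows make the frequency-string branch of
# the helper above unnecessary; a minimal sketch with made-up data:
import numpy as np
import pandas as pd

idx = pd.date_range('2020-01-01', periods=10, freq='30s')
frame = pd.DataFrame({'value': np.arange(10.0)}, index=idx)
smoothed = frame.rolling('90s', min_periods=1).mean()  # trailing 90-second window
print(smoothed)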
def schedule(self, positions: List[Position], data_ser: pd.Series,
             context: BacktestContext):
    avg: pd.Series = (data_ser.apply(lambda x: x.open)
                      .rolling(self.lookback).mean())
    todays_price_ = data_ser.iloc[-1].open  # positional indexing via .iloc
    instruction = None
    if data_ser.iloc[-2].open <= avg.iloc[-2] and todays_price_ > avg.iloc[-1]:
        instruction = TradeInstruction(
            todays_price_,
            todays_price_ - as_price(self.stop_value, data_ser.name),
            self.risk_per_trade,
            data_ser.name,
            data_ser.index[-1])
    if data_ser.iloc[-2].open >= avg.iloc[-2] and todays_price_ < avg.iloc[-1]:
        instruction = TradeInstruction(
            todays_price_,
            todays_price_ + as_price(self.stop_value, data_ser.name),
            -self.risk_per_trade,
            data_ser.name,
            data_ser.index[-1])
    return instruction
def adjustDays(normedDays, period):
    # Concatenate all days (Series.append was removed in pandas 2.0).
    n = pd.concat(list(normedDays))
    # Tiny per-element offsets keep index values unique after the modulo.
    idx = array(n.index.copy(), dtype=float) + arange(len(n)) * 10e-5
    n = Series(array(n), idx % period)
    idx_rel = array(n.index.copy(), dtype=float)
    n = n.sort_index()
    n = n.apply(cycle_adjust)
    n = Series(array(n[idx_rel]), index=idx)
    adj = split(arange(len(idx)) * 10e-5, cumsum(list(map(len, normedDays))))
    normedDaysAdj = [n[array(d.index, dtype=float) + adj[i]]
                     for (i, d) in enumerate(normedDays)]
    # Undo the disambiguating offsets on each day's index.
    for i, day in enumerate(normedDaysAdj):
        normedDaysAdj[i] = Series(day,
                                  index=array(day.index, dtype=float) - adj[i])
    return normedDaysAdj
def convert_series_of_lists_to_df(column: pd.Series, prefix='', prefix_sep=''):
    """
    input:
    index   groups
    0       ['a','b','c']
    1       ['c']
    2       ['b','c','e']
    3       ['a','c']
    4       ['b','e']

    output:
    index   a   b   c   e
    0       1   1   1   0
    1       0   0   1   0
    2       0   1   1   1
    3       1   0   1   0
    4       0   1   0   1
    """
    return pd.get_dummies(column.apply(pd.Series), prefix=prefix,
                          prefix_sep=prefix_sep).sum(level=0, axis=1)
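# Hypothetical round trip for the function above. Note: DataFrame.sum(level=...)
# was removed in pandas 2.0; on modern pandas replace the last step with
# .groupby(level=0, axis=1).sum().
groups = pd.Series([['a', 'b', 'c'], ['c'], ['b', 'c', 'e']])
print(convert_series_of_lists_to_df(groups))
#    a  b  c  e
# 0  1  1  1  0
# 1  0  0  1  0
# 2  0  1  1  1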
def test_apply_empty(self, float_frame, empty_frame):
    # empty
    applied = empty_frame.apply(np.sqrt)
    assert applied.empty

    applied = empty_frame.apply(np.mean)
    assert applied.empty

    no_rows = float_frame[:0]
    result = no_rows.apply(lambda x: x.mean())
    expected = Series(np.nan, index=float_frame.columns)
    assert_series_equal(result, expected)

    no_cols = float_frame.loc[:, []]
    result = no_cols.apply(lambda x: x.mean(), axis=1)
    expected = Series(np.nan, index=float_frame.index)
    assert_series_equal(result, expected)

    # GH 2476
    expected = DataFrame(index=['a'])
    result = expected.apply(lambda x: x['a'], axis=1)
    assert_frame_equal(expected, result)
def getPointArray(conshpfn):
    conShp = shapefile.Reader(conshpfn)
    conShapes = conShp.shapes()
    conShapeArray = []
    for conShape in conShapes:
        numOfShapePoints = len(conShape.points)
        conShapePartArray = copy.deepcopy(conShape.parts)
        conShapePartArray.append(numOfShapePoints)
        partPointsArray = []
        for partIndex in range(len(conShape.parts)):
            partPointsArray.append(
                conShape.points[conShapePartArray[partIndex]:
                                conShapePartArray[partIndex + 1]])
        partPointsSeries = Series(partPointsArray)
        numOfPartPointsSeries = partPointsSeries.apply(lambda x: len(x))
        numOfPartPointsSeries = numOfPartPointsSeries.rank(method='first')
        rankDic = {}
        for i, numOfPartPointsSeriesItem in enumerate(numOfPartPointsSeries):
            rankDic[numOfPartPointsSeriesItem] = partPointsSeries[i]
        # dict.keys() is a view in Python 3 and cannot be sorted in place
        rankDicKeys = sorted(rankDic.keys(), reverse=True)
        sortedPartPointsArray = []
        for rankDicKey in rankDicKeys:
            sortedPartPointsArray.append(rankDic[rankDicKey])
        conShapeArray.append(sortedPartPointsArray)
    return conShapeArray
def rolling_timeslice_apply(data, window, func, min_periods=1, direc='F'):
    def f(x):
        '''Function to apply that actually computes the rolling statistic'''
        delta = pd.tseries.frequencies.to_offset(window).delta
        time_slice = col[str(x):str(x + delta)]
        if direc == 'B':
            time_slice = col[str(x - delta):str(x)][::-1]
        if time_slice.size < min_periods:
            return np.nan
        else:
            return func(time_slice)

    data = DataFrame(data.copy())
    dfout = DataFrame()
    if isinstance(window, str):  # basestring is Python 2 only
        idx = Series(data.index.to_pydatetime(), index=data.index)
        for colname, col in data.items():  # iterkv was removed from pandas
            result = idx.apply(f)
            result.name = colname
            dfout = dfout.join(result, how='outer')
    if dfout.columns.size == 1:
        dfout = dfout.iloc[:, 0]  # .ix is deprecated
    return dfout
def stem(self, raw: pd.Series):
    return raw.apply(
        lambda x: [
            self.nlp.stem(word)
            for word in self.nlp.tokenize(x)
            if self.nlp.stem(word) is not None
        ]
    )
def test_apply_args():
    s = Series(["foo,bar"])

    result = s.apply(str.split, args=(",",))
    assert result[0] == ["foo", "bar"]
    assert isinstance(result[0], list)
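# args are passed positionally after the element itself, so the test above is
# equivalent to the vectorized string accessor:
s2 = Series(["foo,bar"])
assert s2.apply(str.split, args=(",",))[0] == s2.str.split(",")[0] == ["foo", "bar"]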
def MLE_fast_func_with_p0t_close_to_mean_of_prev_week(self, p0t: list,
                                                      num_of_prev_dayes=7):
    """
    Can only work if beta is not 0; substitutes p0.

    :param p0t: variables to substitute for p0
    :return: list of the results after substituting p0
    """
    if self.beta == 0:
        raise KeyError
    #####
    # p0t = [min(i, 0.999) for i in p0t]
    #####
    series_p0 = Series(p0t, self.N_time)
    M = self.Mi(series_p0)
    fixed_k = self.K.swaplevel(0, 1)
    # fixed_n = self.N.swaplevel(0, 1)
    part_a = DataFrame(index=self.N_time, columns=self.N_feature, dtype=float)
    part_c = DataFrame(index=self.N_time, columns=self.N_feature, dtype=float)
    upper_part_for_first_argument = DataFrame(index=self.N_time,
                                              columns=self.N_feature)
    for i in self.N_feature:
        upper_part_for_first_argument[i] = \
            fixed_k[i].apply(lambda x: (x + self.alpha) * M[i]) - \
            self.fixed_n_without_sigma[i]
        part_a[i] = fixed_k[i].apply(
            lambda x: self.sigma**2 * (x + self.alpha) * M[i]) - \
            self.fixed_n_with_sigma[i]
        part_c[i] = series_p0.apply(lambda x: M[i] - x * self.teta[i])
    # part_a = part_a.swapaxes(0, 1)
    part_c = part_c.swapaxes(0, 1)
    upper_part_for_first_argument = upper_part_for_first_argument.swapaxes(0, 1)
    return_list = np.array([
        sum(upper_part_for_first_argument[self.N_time[0]] /
            (series_p0[self.N_time[0]] * part_c[self.N_time[0]]))
    ])
    for t in self.N_time[1:num_of_prev_dayes]:
        part_b = series_p0[t] * (
            series_p0[t] - series_p0[self.N_time[self.N_time.index(t) - 1]])
        # With many days this can run out of memory; collect to avoid it.
        # gc.collect()
        return_list = np.append(
            return_list,
            sum((part_a.swapaxes(0, 1)[t] - (part_b * part_c[t])) / part_c[t]))
    for t in self.N_time[num_of_prev_dayes:]:
        part_b = series_p0[t] * (series_p0[t] - statistics.mean(
            series_p0[self.N_time[self.N_time.index(t) - num_of_prev_dayes]:
                      self.N_time[self.N_time.index(t)]]))
        # With many days this can run out of memory; collect to avoid it.
        # gc.collect()
        return_list = np.append(
            return_list,
            sum((part_a.swapaxes(0, 1)[t] - (part_b * part_c[t])) / part_c[t]))
    return return_list
def test_apply_dont_convert_dtype(self):
    s = Series(np.random.randn(10))

    f = lambda x: x if x > 0 else np.nan
    result = s.apply(f, convert_dtype=False)
    assert result.dtype == object
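# convert_dtype was deprecated in pandas 2.1; casting afterwards is the
# suggested equivalent (a sketch, not part of the test suite):
s3 = Series(np.random.randn(10))
result3 = s3.apply(lambda x: x if x > 0 else np.nan).astype(object)
assert result3.dtype == object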
def to_ip(series: pd.Series) -> pd.Series:
    return series.apply(ip_address)
def validate(self, series: pd.Series) -> pd.Series:
    return series.apply(self.can_call)
def convert_time_steps_to_year(self, time_steps: pd.Series) -> pd.Series:
    """converts a number of time steps relative to reference date into
    absolute dates"""
    return time_steps.apply(lambda x: np.round(
        x * self.time_step + self.reference_year, SIG_FIGS))
emp = pd.read_csv("c:/r/emp.csv",names = ["empid","name","job","mgr","hire_date","sal","comm","deptno"]) emp[emp['comm'].notnull()][['name','comm']] apply라는 메소드 공부 s1 = Series([1,2,3]) s1**2 def square(x): return x**2 square(2) apply함수는 행, 열값을 인수값으로 받아서 반복하여 그 함수를 적용한다. s1.apply(square) s1.apply(lambda x: x**2) df = DataFrame([[1,2,3],[4,5,6]]) df.apply(square,axis = 0) #0 : 각 컬럼이 함수에 적용, 1: 각 row가 함수에 적용 df[0].apply(square) import numpy as np df.apply(np.sum, axis = 0) df.apply(np.sum, axis = 1) df.apply(lambda x: x**2)
def reconstruct_taxonomy(reconstruction_map: pd.Series,
                         taxonomy: pd.Series,
                         database: str = 'none',
                         define_missing: str = 'merge',
                         ambiguity_handling: str = 'ignore',
                         ) -> pd.Series:
    """
    Reconstructs the taxonomic annotation based on a sidle database by
    identifying the lowest taxonomic level where the taxonomic annotation
    diverges for two sequences

    Parameters
    ----------
    reconstruction_map : pd.Series
        The relationship between raw sequences and the reconstructed sidle
        database
    taxonomy : pd.Series
        A taxonomic description of each sequence
    database : {'greengenes', 'silva', 'none'}
        The database used for the taxonomy. This is important for selecting
        the correct taxonomic delimiter and for removing missing sequences.
    define_missing : {'merge', 'inherit', 'ignore'}
        Taxonomic strings may be missing information (for example `g__` in
        greengenes or `D_5__uncultured bacteria` in Silva). These can be
        ignored (`"ignore"`) and treated like any other taxonomic
        designation; they can be first inherited in merged sequences
        (`"merge"`), where, when two strings are being merged and one has a
        missing level, the missing level is taken from the defined one; or
        they can be inherited from the previous level (`"inherit"`) first,
        and then merged.
    ambiguity_handling : {'missing', 'ignore'}
        Whether "ambiguous taxa" (Silva-specific) should be treated as
        missing values (`"missing"`) or ignored (`"ignore"`)

    Returns
    -------
    pd.Series
        A series describing the new taxonomy
    """
    if (database == 'none') and (define_missing != 'ignore'):
        warnings.warn('When no database is specified, '
                      'missing values are ignored by default', UserWarning)
    if (database == 'none') and (ambiguity_handling != 'ignore'):
        warnings.warn('When no database is specified, '
                      'ambiguous values are ignored by default', UserWarning)
    if (database == 'greengenes') and (ambiguity_handling != 'ignore'):
        warnings.warn('Greengenes does not include ambiguous taxa. The '
                      'ambiguity handling will be ignored.', UserWarning)

    # Filters the taxonomy and converts to levels
    db_lookup = database_params.get(database, 'none')
    delim = db_lookup['delim']

    def split_taxonomy(x):
        return pd.Series([s.strip(' ') for s in x.split(delim)])

    taxonomy = taxonomy.loc[reconstruction_map.index]
    taxonomy = taxonomy.apply(split_taxonomy)
    taxonomy.index.set_names('Feature ID', inplace=True)

    if len(taxonomy.columns) == 1:
        raise ValueError('Only one taxonomic level was found. Please check '
                         'your database and delimiter.')

    # Finds the undefined levels
    defined_f = db_lookup['defined']
    undefined_levels = ~pd.concat(axis=1, objs=[
        taxonomy[c].apply(defined_f) for c in taxonomy.columns
    ])
    ambigious_levels = pd.concat(axis=1, objs=[
        taxonomy[c].apply(lambda x: 'ambig' in x) for c in taxonomy.columns
    ])
    ambigious_levels = ambigious_levels.cummax(axis=1)
    undefined = (undefined_levels |
                 (ambigious_levels & (ambiguity_handling == 'missing'))
                 ).astype(bool)

    # Filters missing taxa and handles initial inheritance
    if define_missing != 'ignore':
        taxonomy.mask(undefined, np.nan, inplace=True)
    if define_missing == 'inherit':
        taxonomy.fillna(method='ffill', axis=1, inplace=True)

    # Combines the taxonomy across multiple levels
    def _combine_f(x):
        if pd.isnull(x).all():
            return np.nan
        else:
            return '|'.join(np.sort(x.dropna().unique()))

    def _combine_taxa(g):
        """Helper function to tidy taxonomy"""
        if len(g) == 1:
            return g.iloc[0]
        else:
            return g.apply(_combine_f)

    taxonomy['clean_name'] = reconstruction_map
    collapsed = taxonomy.groupby('clean_name').apply(_combine_taxa)
    collapsed.drop(columns=['clean_name'], inplace=True)

    # Finds splits in the data
    disjoint = pd.concat(axis=1, objs=[
        collapsed[c].apply(lambda x: True if pd.isnull(x) else '|' in x)
        for c in collapsed.columns
    ]).cummax(axis=1)

    # Set up inheritance so you inherit the first split in each row of the data
    disjoint_inheriet = (disjoint.cummax(axis=1) &
                         ~((disjoint.cumsum(axis=1) == 1) & disjoint))
    collapsed.mask(disjoint_inheriet, np.nan, inplace=True)

    # Does nan inheritance
    collapsed.fillna(method='ffill', axis=1, inplace=True)

    # Returns the summarized taxonomy
    new_taxa = collapsed.apply(lambda x: delim.join(list(x.values)), axis=1)
    new_taxa.name = 'Taxon'
    new_taxa.index.set_names('Feature ID', inplace=True)

    return new_taxa
def map_dataframe_with_dict_and_default(default: Any, mapping_dict: dict,
                                        x: pd.Series):
    col_dict = mapping_dict[x.name]
    return x.apply(lambda y: col_dict.get(str(y), default))
def is_column_string(se: pd.Series) -> bool:
    """From the dataframe dtype alone, str cannot be distinguished from object."""
    return se.apply(lambda x: type(x) == str).sum() == se.shape[0]
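# A sketch of the same check via pandas' own type inference, which avoids the
# elementwise apply (assumption: a NaN-free all-string column infers as
# 'string'):
from pandas.api.types import infer_dtype

def is_column_string_fast(se: pd.Series) -> bool:
    return infer_dtype(se) == 'string'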
def prices(series: pd.Series, initial: int = 1,
           type: Returns = Returns.SIMPLE) -> pd.Series:
    """
    Calculate price levels from returns series

    :param series: time series of returns
    :param initial: initial price level
    :param type: returns type: simple, logarithmic or absolute
    :return: date-based time series of prices

    **Usage**

    Compute price levels from returns series, based on the value of *type*:

    ===========   =============================
    Type          Description
    ===========   =============================
    simple        Simple arithmetic returns
    logarithmic   Logarithmic returns
    absolute      Absolute returns
    ===========   =============================

    *Simple*

    Compute asset price series from simple returns:

    :math:`Y_t = (1 + X_{t-1}) Y_{t-1}`

    where :math:`X_t` is the simple return at time :math:`t` and
    :math:`Y_0 = initial`

    *Logarithmic*

    Compute asset price series from logarithmic returns:

    :math:`Y_t = e^{X_{t-1}} Y_{t-1}`

    where :math:`X_t` is the logarithmic return at time :math:`t` and
    :math:`Y_0 = initial`

    *Absolute*

    Compute asset price series from absolute returns:

    :math:`Y_t = X_{t-1} + Y_{t-1}`

    where :math:`X_t` is the absolute return at time :math:`t` and
    :math:`Y_0 = initial`

    **Examples**

    Generate a price series and recover it from its returns:

    >>> series = generate_series(100)
    >>> recovered = prices(returns(series))

    **See also**

    :func:`returns` :func:`product` :func:`exp`
    """
    if series.size < 1:
        return series

    if type == Returns.SIMPLE:
        return product(1 + series) * initial
    elif type == Returns.LOGARITHMIC:
        return product(series.apply(math.exp)) * initial
    elif type == Returns.ABSOLUTE:
        return sum_(series) + initial
    else:
        raise MqValueError(
            'Unknown returns type (use simple / logarithmic / absolute)')
def returns(series: pd.Series, obs: int = 1,
            type: Returns = Returns.SIMPLE) -> pd.Series:
    """
    Calculate returns from price series

    :param series: time series of prices
    :param obs: number of observations
    :param type: returns type: simple, logarithmic or absolute
    :return: date-based time series of returns

    **Usage**

    Compute returns series from price levels, based on the value of *type*:

    ===========   =============================
    Type          Description
    ===========   =============================
    simple        Simple arithmetic returns
    logarithmic   Logarithmic returns
    absolute      Absolute returns
    ===========   =============================

    *Simple*

    Simple geometric change in asset prices, which can be aggregated across
    assets:

    :math:`Y_t = \\frac{X_t}{X_{t-obs}} - 1`

    where :math:`X_t` is the asset price at time :math:`t`

    *Logarithmic*

    Natural logarithm of asset price changes, which can be aggregated through
    time:

    :math:`Y_t = log(X_t) - log(X_{t-obs})`

    where :math:`X_t` is the asset price at time :math:`t`

    *Absolute*

    Absolute change in asset prices:

    :math:`Y_t = X_t - X_{t-obs}`

    where :math:`X_t` is the asset price at time :math:`t`

    **Examples**

    Generate a price series and compute its returns:

    >>> prices = generate_series(100)
    >>> returns = returns(prices)

    **See also**

    :func:`prices`
    """
    if series.size < 1:
        return series

    if type == Returns.SIMPLE:
        ret_series = series / series.shift(obs) - 1
    elif type == Returns.LOGARITHMIC:
        log_s = series.apply(math.log)
        ret_series = log_s - log_s.shift(obs)
    elif type == Returns.ABSOLUTE:
        ret_series = series - series.shift(obs)
    else:
        raise MqValueError(
            'Unknown returns type (use simple / logarithmic / absolute)')

    return ret_series
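# Sanity check of the log-returns round trip with plain pandas (a sketch;
# generate_series and the Returns enum from the library above are not used):
import math
import pandas as pd

x = pd.Series([100.0, 101.0, 99.5, 102.0])
logs = x.apply(math.log)
log_ret = logs - logs.shift(1)  # the LOGARITHMIC branch above
rebuilt = log_ret.apply(math.exp).fillna(1).cumprod() * x.iloc[0]
print(rebuilt.tolist())  # recovers [100.0, 101.0, 99.5, 102.0] up to float error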
emp[~pd.isnull(emp['comm_pct'])][['name', 'comm_pct']]

s1 = Series([1, 2, 3])
s1
s1**2  # vectorized arithmetic, as in R

def square(x):
    return x**2

square(s1)

# apply takes each row/column value as its argument and applies the
# function to each of them in turn
s1.apply(square)
s1.apply(lambda x: x**2)

df = DataFrame([[1, 2, 3], [4, 5, 6]])
df
df.apply(square)
df.apply(lambda x: x**2)
df[0].apply(square)
df[0].apply(lambda x: x**3)
df.apply(square, axis=0)  # 0: apply to each column
df.apply(square, axis=1)  # 1: apply to each row
df.iloc[0, :]  # .ix is deprecated
# # Series.apply(func, convert_dtype=True, args=(), **kwds)
# # Invoke function on values of Series.

# <codecell>

# Let's start by using Series.apply
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.apply.html

# first of all, it's useful to find a way to use apply to return the exact
# same Series

def identity(s):
    return s

lower.apply(identity)

# <codecell>

# show that identity yields the same Series -- first on an element-by-element
# basis
lower.apply(identity) == lower

# <codecell>

# Check that the match happens for every element in the Series using numpy.all
# http://docs.scipy.org/doc/numpy/reference/generated/numpy.all.html
np.all(lower.apply(identity) == lower)

# <headingcell level=2>
def validateType(s: pd.Series, t, text=''):
    validTypeCount = s.apply(type).value_counts().to_dict()[t]
    totalTypes = len(s)
    print(text, str(totalTypes - validTypeCount), ' of ', str(totalTypes),
          ' types are invalid.')
un_rate['ZIP_county'] = un_rate['ZIP']
result1 = pd.merge(data, un_rate, how='inner', on='ZIP_county')
result2 = pd.merge(result1, Unemp, how='inner', on='county')
result11 = result2[["county", "INSTNM", "CrimeRate", "Unemp_11",
                    "ZIP_county"]].dropna()
avg_crimerate = result11['CrimeRate'].mean()
avg_unem = result11['Unemp_11'].mean()

# flag high or low crime rate and high or low unemployment
result11['CH'] = (result11['CrimeRate'] > avg_crimerate).astype(int)
result11['CL'] = (result11['CrimeRate'] < avg_crimerate).astype(int)
result11['UH'] = (result11['Unemp_11'] > avg_unem).astype(int)
result11['UL'] = (result11['Unemp_11'] < avg_unem).astype(int)

c = DataFrame(result11)
c = c.groupby(['UH', 'UL']).sum()

# calculate z scores and two-sided p values
z_scores = Series([-4.76759, 4.907776, 4.76759, -4.907776])
p_values_h0 = scipy.stats.norm.sf(abs(z_scores[0])) * 2
z_scores = DataFrame(z_scores)
z_scores.columns = ['z score']
z_scores['p value'] = z_scores['z score'].apply(
    lambda z: scipy.stats.norm.sf(abs(z)) * 2)
print(z_scores)
def cortical_thickness(xfms: pd.Series,  # nlin avg -> subject XfmHandler (iirc)...
                       atlas: MincAtom,  # nlin avg
                       label_mapping: FileAtom,
                       atlas_fwhm: float,
                       thickness_fwhm: float):
    try:
        import vtk
    except ImportError:
        warnings.warn("couldn't `import vtk`, without which `decimate.py` "
                      "is unable to run ...")
        raise

    s = Stages()

    # generate thickness maps for the average:
    left_grid, right_grid = [s.defer(make_laplace_grid(
                                 input_labels=atlas.labels,
                                 label_mapping=label_mapping,
                                 binary_closing=True,
                                 side=side))
                             for side in (Side.left, Side.right)]

    atlas_left_thickness, atlas_right_thickness = (
        [s.defer(decimate(
             s.defer(minclaplace(
                 input_grid=grid,
                 # enclose entire cortex
                 extra_args=["--create-surface-range", "0", "10"])).surface,
             # FIXME: magic number ... implement a way to specify number
             # rather than fraction instead?
             reduction=0.8,
             smoothing_method=Smoothing.laplace))
         for grid in (left_grid, right_grid)])

    # as per comment in MICe_thickness, blur atlas instead of transformed
    # object files ... ? (maybe this workaround is now obsolete)
    blurred_atlas = s.defer(mincblur(img=atlas, fwhm=atlas_fwhm)).img

    # TODO rename this dataframe
    resampled = (pd.DataFrame(
        {'xfm': xfms,
         # resample the atlas files to each subject:
         'blurred_atlas_grid_resampled':
             xfms.apply(lambda xfm: s.defer(mincresample_new(
                 img=blurred_atlas, xfm=xfm.xfm, like=xfm.target))),
         'atlas_left_resampled':
             xfms.apply(lambda xfm: s.defer(transform_objects(
                 input_obj=atlas_left_thickness, xfm=xfm.xfm))),
         'atlas_right_resampled':
             xfms.apply(lambda xfm: s.defer(transform_objects(
                 input_obj=atlas_right_thickness, xfm=xfm.xfm)))})
        .assign(left_grid=lambda df: df.xfm.map(lambda xfm: s.defer(
                    make_laplace_grid(input_labels=xfm.target,
                                      label_mapping=label_mapping,
                                      binary_closing=True,
                                      side=Side.left))),
                right_grid=lambda df: df.xfm.map(lambda xfm: s.defer(
                    make_laplace_grid(input_labels=xfm.target,
                                      label_mapping=label_mapping,
                                      binary_closing=True,
                                      side=Side.right))))
        .assign(left_thickness=lambda df: df.apply(axis=1, func=lambda row:
                    s.defer(minclaplace(
                        input_grid=row.left_grid,
                        solution_vertices=row.atlas_left_resampled))),
                right_thickness=lambda df: df.apply(axis=1, func=lambda row:
                    s.defer(minclaplace(
                        input_grid=row.right_grid,
                        solution_vertices=row.atlas_right_resampled))))
        .assign(smooth_left_fwhm=lambda df: df.apply(axis=1, func=lambda row:
                    s.defer(diffuse(obj_file=row.atlas_left_resampled,
                                    input_signal=row.left_thickness.solved,
                                    kernel=thickness_fwhm,
                                    iterations=1000))),
                smooth_right_fwhm=lambda df: df.apply(axis=1, func=lambda row:
                    s.defer(diffuse(obj_file=row.atlas_right_resampled,
                                    input_signal=row.right_thickness.solved,
                                    kernel=thickness_fwhm,
                                    iterations=1000)))))
    return Result(stages=s, output=resampled)
def threshold(column: pd.Series, threshold: float) -> pd.Series:
    print(column)
    column = column.apply(lambda x: 'e' if x <= threshold else 'p')
    print(column)
    return column
# The fragment below came from inside a request helper; the name fetch_page
# is a placeholder added so the return statements are valid.
def fetch_page(url):
    result = requests.get(url)
    if result.status_code == 200:
        # print('Request succesful')
        return BeautifulSoup(result.text, "html.parser")
    else:
        print('Request failed', url)
        return None

result = requests.post(link, data=pageLoad)
soupMed = BeautifulSoup(result.text, "html.parser")
# soupMed = BeautifulSoup(result.text)
# print(soupMed)
# print(soupMed.find("a", {"class": "standart"}))
# print(soupMed.find("tr td + a"))
names = [x.text for x in soupMed.find_all(class_="standart")]
# print(names)
# names = [x.text for x in soupMed.find_all("a", {'clasx_': 'standart'})]
names = Series(names)
print(names.str.strip())
# regex_dosage = re.compile(r'\d+')
# regex_
"""
names.str.strip()
names.apply(lambda x: regex.findall(x))
"""
def get_status_coluna(self, serie: pd.Series):
    return serie.apply(self.get_status_individuo)
def euler(max_num):
    # collatz_length is assumed to return (length, index) tuples; built-in
    # max compares tuples lexicographically, so the longest sequence wins.
    indx = Series(np.arange(max_num + 1))
    cols = indx.apply(collatz_length)
    (maxl, maxi) = max(cols.tolist())
    print("The longest Collatz sequence is for %d = %d" % (maxi, maxl))
def test_apply_args(self):
    s = Series(['foo,bar'])

    result = s.apply(str.split, args=(',', ))
    assert result[0] == ['foo', 'bar']
    assert isinstance(result[0], list)
def test_apply_empty_integer_series_with_datetime_index():
    # GH 21245
    s = Series([], index=pd.date_range(start="2018-01-01", periods=0),
               dtype=int)
    result = s.apply(lambda x: x)
    tm.assert_series_equal(result, s)
def simulate(rates, significances, impressions, numTrials, firstLook=None,
             estimateFunction=WaldEstimate, seed=None):
    """
    simulate a single-proportion Z-test

    Args:
        rates (list): success rates
        significances (list): significance values (1 - confidence)
        impressions (int or list): maximum impressions or list of number of
            impressions
        numTrials (int): number of independent simulations to aggregate over
        firstLook (int): first impression at which experiment is evaluated
            for continuous evaluation (defaults to 1)
        estimateFunction (function): binomial approximation to use
            (defaults to Wald)
        seed (int, optional): seed for random number generation (defaults to
            current time)

    Returns:
        avgRejects (DataFrame): simulate single test read at end
        avgAnyRejects (DataFrame): simulate continuous test read after every
            impression

        Both DataFrames contain the estimate and uncertainty on the type I
        error (incorrect rejection of null hypothesis) for each rate,
        significance, and impression value. Results are aggregated across
        numTrials independent experiments.
    """
    trials = range(numTrials)
    base = [rates, significances, trials]
    mi = pandas.MultiIndex.from_product(
        base, names=['rate', 'significance', 'trial'])

    if seed is None:
        numpy.random.seed(int(time.time()))
    else:
        numpy.random.seed(seed)

    if type(impressions) == int:
        points = range(1, impressions + 1)
    else:
        points = impressions

    avgRejects = None
    avgAnyRejects = None
    for n in points:
        if n <= 0:
            raise ValueError("All values in impressions must be positive.")

        draws = DataFrame(
            numpy.random.random(
                [n, len(rates) * len(significances) * len(trials)]),
            columns=mi)
        draws.index = range(1, n + 1)
        successes = draws.copy()
        rejects = draws.copy()

        for rate in rates:
            successes[rate] = draws[rate].applymap(lambda x: int(x < rate))

        cumSuccesses = successes.apply(numpy.core.fromnumeric.cumsum,
                                       raw=True)
        cumImpressions = successes.index.values

        for rate in rates:
            for sig in significances:
                for trial in trials:
                    # zip returns an iterator in Python 3
                    vals = Series(list(zip(
                        cumSuccesses.loc[:, (rate, sig, trial)].values,
                        cumImpressions)))
                    vals.index = cumImpressions
                    rejects.loc[:, (rate, sig, trial)] = vals.apply(
                        lambda x: int(rejectNull(
                            estimateFunction(x[0], x[1], sig), rate)))

        if firstLook is not None:
            anyRejects = rejects.loc[firstLook:].max()  # .ix is deprecated

        # apply binomial approximation to estimate type I error rate
        if avgRejects is None:
            avgRejects = (rejects[-1:]
                          .groupby(axis=1, level=['rate', 'significance'])
                          .sum()
                          .applymap(lambda x: estimateFunction(x, numTrials)))
        else:
            avgRejects.loc[n] = (rejects[-1:]
                                 .groupby(axis=1,
                                          level=['rate', 'significance'])
                                 .sum()
                                 .applymap(lambda x: estimateFunction(
                                     x, numTrials))
                                 .values[0])

        # apply binomial approximation to estimate cumulative type I error
        # rate
        if firstLook is not None:
            if avgAnyRejects is None:
                avgAnyRejects = DataFrame(
                    anyRejects
                    .groupby(level=['rate', 'significance'])
                    .sum()
                    .map(lambda x: estimateFunction(x, numTrials))
                ).transpose()
                avgAnyRejects.index = avgRejects.index.copy()
            else:
                avgAnyRejects.loc[n] = (
                    anyRejects
                    .groupby(level=['rate', 'significance'])
                    .sum()
                    .map(lambda x: estimateFunction(x, numTrials))
                    .values)

    return avgRejects, avgAnyRejects
def nom_comps(srs: dd.Series, head: pd.Series, cfg: Config) -> Dict[str, Any]:
    """
    All computations required for plot(df, Nominal)
    """
    # pylint: disable=too-many-branches

    data: Dict[str, Any] = dict()

    data["nrows"] = srs.shape[0]  # total rows
    srs = srs.dropna()  # drop null values
    grps = srs.value_counts(sort=False)  # counts of unique values
    data["geo"] = grps

    if cfg.stats.enable or cfg.bar.enable or cfg.pie.enable:
        data["nuniq"] = grps.shape[0]  # total number of groups

    # compute bar and pie together unless the parameters are different
    if cfg.bar.enable or cfg.pie.enable:
        # select the largest or smallest groups
        data["bar"] = (grps.nlargest(cfg.bar.bars)
                       if cfg.bar.sort_descending
                       else grps.nsmallest(cfg.bar.bars))

        if (cfg.bar.bars == cfg.pie.slices
                and cfg.bar.sort_descending == cfg.pie.sort_descending):
            data["pie"] = data["bar"]
        else:
            data["pie"] = (grps.nlargest(cfg.pie.slices)
                           if cfg.pie.sort_descending
                           else grps.nsmallest(cfg.pie.slices))

        if cfg.bar.bars == cfg.value_table.ngroups and cfg.bar.sort_descending:
            data["value_table"] = data["bar"]
        elif (cfg.pie.slices == cfg.value_table.ngroups
                and cfg.pie.sort_descending):
            data["value_table"] = data["pie"]
        else:
            data["value_table"] = grps.nlargest(cfg.value_table.ngroups)

        if cfg.insight.enable:
            data["chisq"] = chisquare(grps.values)

    df = grps.reset_index()  # dataframe with group names and counts

    if cfg.stats.enable or cfg.wordlen.enable:
        if not head.apply(lambda x: isinstance(x, str)).all():
            # srs must be a string to compute the value lengths
            srs = srs.astype(str)
    if cfg.stats.enable or cfg.wordcloud.enable or cfg.wordfreq.enable:
        if not head.apply(lambda x: isinstance(x, str)).all():
            df[df.columns[0]] = df[df.columns[0]].astype(str)

    if cfg.stats.enable:
        data.update(_calc_nom_stats(srs, df, data["nrows"], data["nuniq"]))
    elif cfg.wordfreq.enable and cfg.insight.enable:
        data["len_stats"] = {
            "Minimum": srs.str.len().min(),
            "Maximum": srs.str.len().max(),
        }

    if cfg.wordlen.enable:
        lens = srs.str.len()
        data["len_hist"] = da.histogram(lens, cfg.wordlen.bins,
                                        (lens.min(), lens.max()))

    if cfg.wordcloud.enable or cfg.wordfreq.enable:
        if all(getattr(cfg.wordcloud, att) == getattr(cfg.wordfreq, att)
               for att in ("top_words", "stopword", "stem", "lemmatize")):
            word_freqs = _calc_word_freq(
                df,
                cfg.wordfreq.top_words,
                cfg.wordfreq.stopword,
                cfg.wordfreq.lemmatize,
                cfg.wordfreq.stem,
            )
            data["word_cnts_cloud"] = word_freqs["word_cnts"]
            data["nuniq_words_cloud"] = word_freqs["nuniq_words"]
        else:
            word_freqs = _calc_word_freq(
                df.copy(),
                cfg.wordfreq.top_words,
                cfg.wordfreq.stopword,
                cfg.wordfreq.lemmatize,
                cfg.wordfreq.stem,
            )
            word_freqs_cloud = _calc_word_freq(
                df,
                cfg.wordcloud.top_words,
                cfg.wordcloud.stopword,
                cfg.wordcloud.lemmatize,
                cfg.wordcloud.stem,
            )
            data["word_cnts_cloud"] = word_freqs_cloud["word_cnts"]
            data["nuniq_words_cloud"] = word_freqs["nuniq_words"]

        data["word_cnts_freq"] = word_freqs["word_cnts"]
        data["nwords_freq"] = word_freqs["nwords"]

    return data
dataframe1 = DataFrame(([1, 2, 3, 4, 7, 11],
                        [4, 5, 6, 9, 5, 0],
                        [7, 5, 8, 12, 1, 11]),
                       columns=['col1', 'col2', 'col3', 'col4', 'col5', 'col6'])
ser_from_df1 = dataframe1.iloc[:, 0:1]

# convert series to an array
# NOTE: older pandas had no direct to-array method (modern versions have
# Series.to_numpy); going through tolist() works everywhere.
series10 = Series(['100', '200', 'python', '300.12', '400'])
series_to_array = np.array(series10.tolist())

""" Keep this problem in mind """
# convert series of lists to one series
ser1 = Series([['Red', 'Green', 'White'], ['Red', 'Black'], ['Yellow']])
ser1_one = ser1.apply(Series).stack().reset_index(drop=True)

# Sort a series
arr = ['100', '200', 'Python', '300.12', '400']
arr = Series(arr)
arr_sort = arr.sort_values()
new_arr = arr.append(Series(['500', 'php']))

# New subset of a series based on a given condition
ser2 = Series(np.arange(10))
ser2_condition = ser2[ser2 < 6]

# change the order of the index
ser3 = Series(np.arange(1, 6, 1), index=['A', 'B', 'C', 'D', 'E'])
ser3 = ser3.reindex(index=['B', 'A', 'C', 'D', 'E'])
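# Modern alternative for the "series of lists" flattening above: since pandas
# 0.25, Series.explode avoids the apply(Series).stack() detour.
ser1_flat = ser1.explode().reset_index(drop=True)
# same values as ser1_one: Red, Green, White, Red, Black, Yellow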
def cut_and_align_helper(base_line: pd.Series, compared_line: pd.Series):
    """Cut and align the two input lines.

    Args:
        base_line (pd.Series): the sequence of coordinates
        compared_line (pd.Series): the sequence of coordinates

    Returns:
        [LineString]: segments of compared_line
    """
    # base_line, compared_line = line2_, line1_
    # base_line, compared_line = line1_, line2_
    mask = compared_line.apply(
        lambda x: the_foot_point_on_line(x, base_line))

    # Special case: one segment covers the entire panos road section.
    if mask.sum() == 0:
        # Here base_line is the panos route and compared_line the OSM route.
        road_segment = compared_line
        foot0 = get_foot_point(base_line.iloc[0],
                               road_segment.iloc[0],
                               road_segment.iloc[-1])
        foot1 = get_foot_point(base_line.iloc[-1],
                               road_segment.iloc[0],
                               road_segment.iloc[-1])
        l0 = {'x0': road_segment.iloc[0][0],
              'x1': road_segment.iloc[-1][0],
              'y0': road_segment.iloc[0][1],
              'y1': road_segment.iloc[-1][1]}
        l1 = {'x0': foot0[0], 'x1': foot1[0],
              'y0': foot0[1], 'y1': foot1[1]}
        coords_new = ([foot0, foot1]
                      if -30 <= angle_bet_two_line(l0, l1) <= 30
                      else [foot1, foot0])
        return LineString(coords_new)

    left, right = 0, len(mask) - 1
    start, end = left, right

    while not mask.iloc[left]:
        left += 1
    while not mask.iloc[right]:
        right -= 1

    left_foot_point, right_foot_point = [], []
    if left != start:
        panos_segment = compared_line.iloc[left - 1: left + 1]
        if the_foot_point_on_line(base_line.iloc[0], panos_segment,
                                  ratio_thres=0):
            left_foot_point = get_foot_point(base_line.iloc[0],
                                             compared_line.iloc[left - 1],
                                             compared_line.iloc[left])
        if the_foot_point_on_line(base_line.iloc[-1], panos_segment,
                                  ratio_thres=0):
            left_foot_point = get_foot_point(base_line.iloc[-1],
                                             compared_line.iloc[left - 1],
                                             compared_line.iloc[left])
        # assert len(left_foot_point) != 0

    if right != end:
        panos_segment = compared_line.iloc[right: right + 2]
        if the_foot_point_on_line(base_line.iloc[0], panos_segment,
                                  ratio_thres=0):
            right_foot_point = get_foot_point(base_line.iloc[0],
                                              compared_line.iloc[right],
                                              compared_line.iloc[right + 1])
        if the_foot_point_on_line(base_line.iloc[-1], panos_segment,
                                  ratio_thres=0):
            right_foot_point = get_foot_point(base_line.iloc[-1],
                                              compared_line.iloc[right],
                                              compared_line.iloc[right + 1])
        # assert len(right_foot_point) != 0

    # add the foot points to the new coords
    coords_new = compared_line.iloc[left: right + 1].values.tolist()
    if len(left_foot_point) > 0:
        coords_new = [left_foot_point] + coords_new
    if len(right_foot_point) > 0:
        coords_new = coords_new + [right_foot_point]

    return LineString(coords_new if len(coords_new) > 1 else coords_new * 2)
def to_uuid(series: pd.Series) -> pd.Series:
    return series.apply(uuid.UUID)
}
url = 'http://base-donnees-publique.medicaments.gouv.fr/index.php#result'
result = requests.post(url, data=payload)
soup = BeautifulSoup(result.text)

# the text area of our search results
names = [x.text for x in soup.find_all(class_="standart")]
names = Series(names)
# strip() removes the surrounding whitespace
names = names.str.strip()

# use a regular expression to find the digits (dosage)
regex_dosage = re.compile(r'\d+')
names.apply(lambda x: regex_dosage.findall(x))

# use a regular expression to find the unit
regex_unite = re.compile(r'(microgrammes|µg|grammes|gL)')
names.apply(lambda x: regex_unite.findall(x))

regex_form = re.compile(r'comprim\xe9 s\xe9cable')
names.apply(lambda x: regex_form.findall(x))

a = names.apply(lambda x: regex_dosage.findall(x))
b = names.apply(lambda x: regex_unite.findall(x))
c = names.apply(lambda x: regex_form.findall(x))
d = {'dosage': a, 'unite': b, 'forme': c}
e = DataFrame(d)
def problem_predict(pred: pd.Series) -> np.ndarray:
    problem_mask = pred.apply(lambda x: x.size == 0)
    if np.any(problem_mask):
        warnings.warn(f"Empty prediction for {problem_mask.sum()} objects. "
                      "Replaced with highest prob class.")
    problem_idx = np.where(problem_mask)[0]
    return problem_idx
def test_apply_args(self):
    s = Series(['foo,bar'])

    result = s.apply(str.split, args=(',', ))
    self.assertEqual(result[0], ['foo', 'bar'])
    tm.assertIsInstance(result[0], list)
def _clean_conjugation_suffixes(self, pool: pd.Series) -> pd.Series:
    """Clean suffix that indicates how to conjugate the words."""
    pool_clean: pd.Series = pool.apply(self._remove_conjugation_suffix_from_word)
    return pool_clean
def validate(self, series: pd.Series) -> pd.Series:
    return series.apply(self._validation)
def seriesTypes(s: pd.Series):
    return s.apply(type).value_counts()
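# Hypothetical usage on a mixed-type column; value_counts orders by count,
# ties in arbitrary order:
mixed = pd.Series([1, 'a', 2.5, 'b', None])
print(seriesTypes(mixed))  # str: 2, int: 1, float: 1, NoneType: 1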
def look_for_date(column_i: pd.Series):
    dates = {date: pd.to_datetime(date) for date in column_i.unique()}
    return column_i.apply(lambda x: dates[x])
# ```

# <markdowncell>

# **A9**:
#
# <pre>
# 0    0
# 1    1
# 2    2
# 3    3
# 4    4
# </pre>

# <codecell>

s1.apply(lambda k: 2*k).sum()

# <markdowncell>

# **Q10**: What is
#
# ```Python
# s1.apply(lambda k: 2*k).sum()
# ```

# <markdowncell>

# **A10**:
#
# <pre>
# 10
# </pre>
def to_complex(series: pd.Series) -> pd.Series:
    return series.apply(complex)
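# Hypothetical usage; complex() accepts both numbers and strings like '1+2j':
print(to_complex(pd.Series(['1+2j', '3', 4])).tolist())
# [(1+2j), (3+0j), (4+0j)]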
    'isAlphabet': 0,
    'inClauseSubst': 0,
    'nomSubstances': '',
    'typeRecherche': 0,
    'choixRecherche': 'medicament',
    'txtCaracteres': 'levothyroxine',
    'btnMedic.x': 9,
    'btnMedic.y': 15,
    'btnMedic': 'Rechercher',
    'radLibelle': 2,
    'txtCaracteresSub': '',
    'radLibelleSub': 4
}
raw_data = requests.post(
    'http://base-donnees-publique.medicaments.gouv.fr/index.php#result',
    data=payload).text
html = BeautifulSoup(raw_data)
drugss = html.findAll('a', class_="standart")
drugs = [drug.text for drug in drugss]
names = Series(drugs)
names = names.str.strip()  # strip() returns a new Series; keep the result

regex_dosage = re.compile(r'\d+')
regex_units = re.compile(r'(microgrammes|µg|grammes)')
# collect the matches in a DataFrame; assigning names['dosage'] on a Series
# would only append a single labelled element, not add a column
extracted = DataFrame({'name': names,
                       'dosage': names.apply(lambda x: regex_dosage.findall(x)),
                       'units': names.apply(lambda x: regex_units.findall(x))})
def unlist_series(s: pd.Series) -> pd.Series:
    return s.apply(lambda x: x[0])