def test_include_na(self, sparse, dtype):
    s = ['a', 'b', np.nan]
    res = get_dummies(s, sparse=sparse, dtype=dtype)
    exp = DataFrame({'a': [1, 0, 0],
                     'b': [0, 1, 0]},
                    dtype=self.effective_dtype(dtype))
    if sparse:
        exp = exp.apply(pd.SparseArray, fill_value=0.0)
    assert_frame_equal(res, exp)

    # Sparse dataframes do not allow nan labelled columns, see GH 8822
    res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype)
    exp_na = DataFrame({nan: [0, 0, 1],
                        'a': [1, 0, 0],
                        'b': [0, 1, 0]},
                       dtype=self.effective_dtype(dtype))
    exp_na = exp_na.reindex(['a', 'b', nan], axis=1)
    # hack (NaN handling in assert_index_equal)
    exp_na.columns = res_na.columns
    if sparse:
        exp_na = exp_na.apply(pd.SparseArray, fill_value=0.0)
    assert_frame_equal(res_na, exp_na)

    res_just_na = get_dummies([nan], dummy_na=True,
                              sparse=sparse, dtype=dtype)
    exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan],
                            dtype=self.effective_dtype(dtype))
    tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
def test_frequency_is_original(self, num_cols):
    # GH 22150
    index = pd.DatetimeIndex(["1950-06-30", "1952-10-24", "1953-05-29"])
    original = index.copy()
    df = DataFrame(1, index=index, columns=range(num_cols))
    df.apply(lambda x: x)
    assert index.freq == original.freq
def test_apply(self, float_frame):
    with np.errstate(all='ignore'):
        # ufunc
        applied = float_frame.apply(np.sqrt)
        tm.assert_series_equal(np.sqrt(float_frame['A']), applied['A'])

        # aggregator
        applied = float_frame.apply(np.mean)
        assert applied['A'] == np.mean(float_frame['A'])

        d = float_frame.index[0]
        applied = float_frame.apply(np.mean, axis=1)
        assert applied[d] == np.mean(float_frame.xs(d))
        assert applied.index is float_frame.index  # want this

    # invalid axis
    df = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'a', 'c'])
    with pytest.raises(ValueError):
        df.apply(lambda x: x, 2)

    # GH 9573
    df = DataFrame({'c0': ['A', 'A', 'B', 'B'],
                    'c1': ['C', 'C', 'D', 'D']})
    df = df.apply(lambda ts: ts.astype('category'))
    assert df.shape == (4, 2)
    assert isinstance(df['c0'].dtype, CategoricalDtype)
    assert isinstance(df['c1'].dtype, CategoricalDtype)
def test_apply_modify_traceback(self):
    data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo',
                            'bar', 'bar', 'bar', 'bar',
                            'foo', 'foo', 'foo'],
                      'B': ['one', 'one', 'one', 'two',
                            'one', 'one', 'one', 'two',
                            'two', 'two', 'one'],
                      'C': ['dull', 'dull', 'shiny', 'dull',
                            'dull', 'shiny', 'shiny', 'dull',
                            'shiny', 'shiny', 'shiny'],
                      'D': np.random.randn(11),
                      'E': np.random.randn(11),
                      'F': np.random.randn(11)})

    data.loc[4, 'C'] = np.nan

    def transform(row):
        if row['C'].startswith('shin') and row['A'] == 'foo':
            row['D'] = 7
        return row

    def transform2(row):
        if (notna(row['C']) and row['C'].startswith('shin') and
                row['A'] == 'foo'):
            row['D'] = 7
        return row

    try:
        data.apply(transform, axis=1)
    except AttributeError as e:
        assert len(e.args) == 2
        assert e.args[1] == 'occurred at index 4'
        assert e.args[0] == "'float' object has no attribute 'startswith'"
def test_with_dictlike_columns(self):
    # GH 17602
    df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
    result = df.apply(lambda x: {'s': x['a'] + x['b']}, axis=1)
    expected = Series([{'s': 3} for t in df.itertuples()])
    assert_series_equal(result, expected)

    df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'),
                pd.Timestamp('2017-05-02 00:00:00')]
    result = df.apply(lambda x: {'s': x['a'] + x['b']}, axis=1)
    assert_series_equal(result, expected)

    # compose a series
    result = (df['a'] + df['b']).apply(lambda x: {'s': x})
    expected = Series([{'s': 3}, {'s': 3}])
    assert_series_equal(result, expected)

    # GH 18775
    df = DataFrame()
    df["author"] = ["X", "Y", "Z"]
    df["publisher"] = ["BBC", "NBC", "N24"]
    df["date"] = pd.to_datetime(['17-10-2010 07:15:30',
                                 '13-05-2011 08:20:35',
                                 '15-01-2013 09:09:09'])
    result = df.apply(lambda x: {}, axis=1)
    expected = Series([{}, {}, {}])
    assert_series_equal(result, expected)
def test_apply_differently_indexed(self):
    df = DataFrame(np.random.randn(20, 10))

    result0 = df.apply(Series.describe, axis=0)
    expected0 = DataFrame(dict((i, v.describe())
                               for i, v in compat.iteritems(df)),
                          columns=df.columns)
    assert_frame_equal(result0, expected0)

    result1 = df.apply(Series.describe, axis=1)
    expected1 = DataFrame(dict((i, v.describe())
                               for i, v in compat.iteritems(df.T)),
                          columns=df.index).T
    assert_frame_equal(result1, expected1)
def test_result_type_error(self, result_type):
    # allowed result_type
    df = DataFrame(
        np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1,
        columns=['A', 'B', 'C'])

    with pytest.raises(ValueError):
        df.apply(lambda x: [1, 2, 3], axis=1, result_type=result_type)
def test_apply_non_numpy_dtype(self):
    df = DataFrame({"dt": pd.date_range("2015-01-01", periods=3,
                                        tz="Europe/Brussels")})
    result = df.apply(lambda x: x)
    assert_frame_equal(result, df)

    result = df.apply(lambda x: x + pd.Timedelta("1day"))
    expected = DataFrame({"dt": pd.date_range("2015-01-02", periods=3,
                                              tz="Europe/Brussels")})
    assert_frame_equal(result, expected)

    df = DataFrame({"dt": ["a", "b", "c", "a"]}, dtype="category")
    result = df.apply(lambda x: x)
    assert_frame_equal(result, df)
def applyDataFrame():
    df = DataFrame(np.arange(12).reshape(4, 3),
                   columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])
    print(df)

    # apply an aggregation column-wise and row-wise
    f = lambda x: x.max() - x.min()
    func1 = df.apply(f, axis=0)
    func2 = df.apply(f, axis=1)
    print(func1)
    print(func2)

    # applymap formats every element of the frame
    f2 = lambda x: '%.2f' % x
    print(df.applymap(f2))
def data_prep(input_file, bad_samples_file, freq_dict=None):
    '''Prepare the ibdhmm file by removing sites that are too close to each
    other and calculating the major and minor allele. If specified, freq_dict
    should be a json file that contains the frequencies; it is created from
    freq_parse.py.'''
    min_snpD = 10
    tri_allele = 0
    output_file = ('.').join(input_file.split('.')[0:-2]) + '_cleaned.txt'

    # relaxing conditions because we only have 3000 SNPs to begin with
    bad_samples = [sample.strip() for sample in open(bad_samples_file)]
    df = DataFrame(read_csv(input_file, sep='\t'))

    # remove bad samples
    df.drop(bad_samples, inplace=True, axis=1)

    # remove non-biallelic alleles
    # df.drop(df[df.apply(allele_count, axis=1) != 2].index, inplace=True)
    # relaxing conditions because we only have 3000 SNPs to begin with

    '''#remove SNPs that are too close to one another
    df['diff'] = df.groupby('chrom')['pos'].diff()
    df.fillna('first', inplace=True)
    #df.to_csv('test_df.txt', sep='\t')
    # BUG NOTE MUST FIX THE DAISY CHAIN PROBLEM
    df = df.query('diff > 10 or diff == "first"')
    df.drop('diff', axis=1, inplace=True)'''

    if not freq_dict:
        # calculate the major and minor allele
        major = df.apply(major_find, axis=1)
        minor = df.apply(minor_find, axis=1)
        major_prop = df.apply(major_prop_find, axis=1)
        minor_prop = df.apply(minor_prop_find, axis=1)
    else:
        snp_dict = json.load(open(freq_dict))
        df['keys'] = df['chrom'].map(str) + ':' + df['pos'].map(str)
        major = df['keys'].apply(lambda x: snp_dict[x]['major'])
        major_prop = df['keys'].apply(lambda x: snp_dict[x]['major_freq'])
        minor = df['keys'].apply(lambda x: snp_dict[x]['minor'])
        minor_prop = df['keys'].apply(lambda x: snp_dict[x]['minor_freq'])
        df.drop('keys', inplace=True, axis=1)

    # inserting this stuff into dataframe for future use
    df.insert(3, 'minor_prop', minor_prop)
    df.insert(3, 'minor', minor)
    df.insert(3, 'major_prop', major_prop)
    df.insert(3, 'major', major)

    df.to_csv(output_file, sep='\t', index=False)
    return df
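# A hypothetical invocation of data_prep, for illustration only: the file
# names below are placeholders, and 'freqs.json' stands in for the output of
# freq_parse.py mentioned in the docstring.
#
# cleaned = data_prep('run1.ibdhmm.txt', 'bad_samples.txt',
#                     freq_dict='freqs.json')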
def test_consistent_coerce_for_shapes(self):
    # we want column names to NOT be propagated
    # just because the shape matches the input shape
    df = DataFrame(np.random.randn(4, 3), columns=['A', 'B', 'C'])

    result = df.apply(lambda x: [1, 2, 3], axis=1)
    expected = Series([[1, 2, 3] for t in df.itertuples()])
    assert_series_equal(result, expected)

    result = df.apply(lambda x: [1, 2], axis=1)
    expected = Series([[1, 2] for t in df.itertuples()])
    assert_series_equal(result, expected)
def peakToTroughs(dailyret, dates):
    '''
    Example:
        sr = s['retdat']
        stkd = s['stockData']
        dt = stkd['Date']
        ptk = peakToTroughs(sr, dt)
    '''
    # get cumulative percent changes
    drs = Series(dailyret)
    soc1dr = drs + 1
    soc1cumdr = soc1dr.cumprod()
    localPeaksPairs = peakdetect(y_axis=soc1cumdr, lookahead=1)[0]
    indexOfLocalPeaks = np.empty(len(localPeaksPairs))
    for i in range(len(indexOfLocalPeaks)):
        indexOfLocalPeaks[i] = localPeaksPairs[i][0]

    # data frame with 2 columns, where column 1 is a peak, and column 2 is
    # the next peak that follows it
    dd = DataFrame({'a': indexOfLocalPeaks[0:(len(indexOfLocalPeaks) - 1)],
                    'b': indexOfLocalPeaks[1:len(indexOfLocalPeaks)]})
    # add one more row to dd to represent the last peak and last row of
    # soc1cumdr, so that you calculate the last possible trough, if there
    # was one between the last peak and the last day of data
    lastDdValue = dd.iloc[len(dd) - 1, 1]
    lastValueInData = len(soc1cumdr) - 1
    dd.loc[len(dd)] = [lastDdValue, lastValueInData]

    def minBetween2Peaks(x):
        lowindex = int(x[0])
        highindex = int(x[1])
        minval = min(soc1cumdr[lowindex:(highindex + 1)])
        return minval

    localMins = dd.apply(minBetween2Peaks, axis=1)
    localMins.index = range(len(localMins))
    localPeaks = soc1cumdr[indexOfLocalPeaks.astype(int)]
    localPeaks.index = range(len(localPeaks))
    diffs = (localMins - localPeaks) / localPeaks

    # get indices of localMins in soc1cumdr so that you can get their dates
    def ff(x):
        '''this function gets the index of soc1cumdr whose value = x'''
        r = soc1cumdr[soc1cumdr == x].index[0]
        return r

    indexOfLocalMins = list(map(ff, localMins))
    datesOfLocalMins = Series(dates)[indexOfLocalMins]
    datesOfLocalMins.index = range(len(datesOfLocalMins))

    # calculate peak to end of data
    def minBetweenPeakAndEnd(x):
        arr = soc1cumdr.iloc[int(x[0]):len(soc1cumdr)]
        return min(arr)

    absMinsToEnd = dd.apply(minBetweenPeakAndEnd, axis=1)
    absMinsToEnd.index = range(len(absMinsToEnd))
    diffsToEnd = (absMinsToEnd - localPeaks) / localPeaks

    ret = DataFrame({'Date': datesOfLocalMins, 'Peak': localPeaks,
                     'Valley': localMins, 'Diff': diffs,
                     'DiffToEnd': diffsToEnd})
    return ret
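# A minimal, self-contained sketch of the cumulative-growth series that
# peakToTroughs builds above: daily returns r_t compound as prod(1 + r_t).
# The numbers are toy data.
from pandas import Series

rets = Series([0.01, -0.02, 0.015])
growth = (rets + 1).cumprod()  # 1.010000, 0.989800, 1.004647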
def __init__(self, background: pd.DataFrame, permutations: int = 100):
    """
    :param background: a data frame containing all the observations as
        binary data (1/0 or True/False), where rows represent observations
        and columns represent samples.
    :param permutations: how many permutations by default
    :return:
    """
    self.permutations = permutations
    self.background = background
    # each sample's weight is its share of all positive observations
    self.sample_weights = (background.apply(sum) /
                           background.apply(sum).pipe(sum))
    self.cummulative_sum = np.cumsum(self.sample_weights)
    self.sample_indices = list(range(background.shape[1]))
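# The sample-weight computation from __init__ above, in isolation on toy
# data: each sample's weight is its share of all positive observations, and
# the cumulative sum is what permutation draws can be binned against.
import numpy as np
import pandas as pd

bg = pd.DataFrame({'s1': [1, 0, 1], 's2': [0, 1, 1]})
weights = bg.apply(sum) / bg.apply(sum).pipe(sum)  # s1: 0.5, s2: 0.5
cumulative = np.cumsum(weights)                    # s1: 0.5, s2: 1.0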
def test():
    frame = DataFrame(numpy.random.randn(4, 3),
                      columns=list('bde'),
                      index=['Utah', 'Ohio', 'Texas', 'Oregon'])
    # avoid shadowing the builtins `format` and `range`
    fmt = lambda x: '%.2f' % x
    spread = lambda x: x.max() - x.min()
    # http://stackoverflow.com/questions/19798153/difference-between-map-applymap-and-apply-methods-in-pandas/19798528#19798528
    print(frame.apply(spread))
    print("")
    print(frame.applymap(fmt))
    print("")
    print(frame.apply(spread).map(fmt))
    return frame
def test_with_dictlike_columns_with_infer(self):
    # GH 17602
    df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
    result = df.apply(lambda x: {'s': x['a'] + x['b']},
                      axis=1, result_type='expand')
    expected = DataFrame({'s': [3, 3]})
    assert_frame_equal(result, expected)

    df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'),
                pd.Timestamp('2017-05-02 00:00:00')]
    result = df.apply(lambda x: {'s': x['a'] + x['b']},
                      axis=1, result_type='expand')
    assert_frame_equal(result, expected)
def test_consistency_for_boxed(self, box):
    # passing an array or list should not affect the output shape
    df = DataFrame(
        np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1,
        columns=['A', 'B', 'C'])

    result = df.apply(lambda x: box([1, 2]), axis=1)
    expected = Series([box([1, 2]) for t in df.itertuples()])
    assert_series_equal(result, expected)

    result = df.apply(lambda x: box([1, 2]), axis=1, result_type='expand')
    expected = DataFrame(
        np.tile(np.arange(2, dtype='int64'), 6).reshape(6, -1) + 1)
    assert_frame_equal(result, expected)
def compute_confusion_matrix(target, predicted, normalize=True, sort=True):
    """ returns a confusion matrix as a data frame with labels
    Parameters:
        target (array): the true values
        predicted (array): the predicted values
        normalize (bool): if True, normalize each row to sum to 1
        sort (bool): if True, sort rows/columns by their max value
    Returns (DataFrame): df with the confusion matrix
    """
    # Determine the unique values in the target list, sort them and assign
    # as labels.
    labels = np.unique(list(target))
    labels.sort()

    # Compute the confusion matrix, place into data frame and normalize if
    # desired.
    confusion = metrics.confusion_matrix(target, predicted, labels=labels)
    confusion = DataFrame(confusion, index=labels, columns=labels)
    if normalize:
        confusion = confusion.apply(lambda x: x / np.sum(x), axis=1)

    # If sort is true: find the max value for each row, order by it, and use
    # that order on both axes of the confusion matrix.
    if sort:
        max_values = confusion.max(axis=1)
        max_values = max_values.sort_values(ascending=False)
        order = max_values.index
        confusion = confusion.loc[order, order]

    return confusion
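# A small usage sketch for compute_confusion_matrix, assuming the sklearn
# (`from sklearn import metrics`), numpy and pandas imports used above; the
# labels are toy data.
target = ['cat', 'dog', 'dog', 'cat', 'bird']
predicted = ['cat', 'dog', 'cat', 'cat', 'bird']
print(compute_confusion_matrix(target, predicted, normalize=True, sort=True))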
def test_apply(self):
    with np.errstate(all="ignore"):
        # ufunc
        applied = self.frame.apply(np.sqrt)
        assert_series_equal(np.sqrt(self.frame["A"]), applied["A"])

        # aggregator
        applied = self.frame.apply(np.mean)
        self.assertEqual(applied["A"], np.mean(self.frame["A"]))

        d = self.frame.index[0]
        applied = self.frame.apply(np.mean, axis=1)
        self.assertEqual(applied[d], np.mean(self.frame.xs(d)))
        self.assertIs(applied.index, self.frame.index)  # want this

    # invalid axis
    df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                   index=["a", "a", "c"])
    self.assertRaises(ValueError, df.apply, lambda x: x, 2)

    # GH9573
    df = DataFrame({"c0": ["A", "A", "B", "B"],
                    "c1": ["C", "C", "D", "D"]})
    df = df.apply(lambda ts: ts.astype("category"))
    self.assertEqual(df.shape, (4, 2))
    self.assertTrue(isinstance(df["c0"].dtype, CategoricalDtype))
    self.assertTrue(isinstance(df["c1"].dtype, CategoricalDtype))
def test_apply_empty_infer_type(self):
    no_cols = DataFrame(index=['a', 'b', 'c'])
    no_index = DataFrame(columns=['a', 'b', 'c'])

    def _check(df, f):
        with warnings.catch_warnings(record=True):
            test_res = f(np.array([], dtype='f8'))
        is_reduction = not isinstance(test_res, np.ndarray)

        def _checkit(axis=0, raw=False):
            res = df.apply(f, axis=axis, raw=raw)
            if is_reduction:
                agg_axis = df._get_agg_axis(axis)
                tm.assertIsInstance(res, Series)
                self.assertIs(res.index, agg_axis)
            else:
                tm.assertIsInstance(res, DataFrame)

        _checkit()
        _checkit(axis=1)
        _checkit(raw=True)
        _checkit(axis=0, raw=True)

    with np.errstate(all='ignore'):
        _check(no_cols, lambda x: x)
        _check(no_cols, lambda x: x.mean())
        _check(no_index, lambda x: x)
        _check(no_index, lambda x: x.mean())

    result = no_cols.apply(lambda x: x.mean(), broadcast=True)
    tm.assertIsInstance(result, DataFrame)
def test_apply_mixed_dtype_corner(self):
    df = DataFrame({"A": ["foo"], "B": [1.0]})
    result = df[:0].apply(np.mean, axis=1)
    # the result here is actually kind of ambiguous, should it be a Series
    # or a DataFrame?
    expected = Series(np.nan, index=pd.Index([], dtype="int64"))
    assert_series_equal(result, expected)

    df = DataFrame({"A": ["foo"], "B": [1.0]})
    result = df.apply(lambda x: x["A"], axis=1)
    expected = Series(["foo"], index=[0])
    assert_series_equal(result, expected)

    result = df.apply(lambda x: x["B"], axis=1)
    expected = Series([1.0], index=[0])
    assert_series_equal(result, expected)
def get_flights_from_route(cur, origin, destination):
    """ Returns a dataframe for all flights matching origin, destination. """
    import time

    ### MySQL query
    time0 = time.time()
    cur.execute("SELECT Year, Month, DayofMonth, DayOfWeek, CRSDepTime, "
                "UniqueCarrier, ArrDelay FROM flights_100000 "
                "WHERE Origin = %s and Dest = %s;", (origin, destination))
    rows = cur.fetchall()
    td = time.time() - time0
    print('Database query took %.2f seconds.' % td)

    ### Convert to dataframe
    df = DataFrame(list(rows), columns=['Year', 'Month', 'DayOfMonth',
                                        'DayOfWeek', 'CRSDepTime',
                                        'Carrier', 'ArrDelay'])

    ### Drop columns without delays (cancellations)
    df = df.dropna()

    ### Create some auxiliary columns
    df['DayOfYear'] = df.apply(lambda x: datetime.datetime(
        x['Year'], x['Month'], x['DayOfMonth']).timetuple().tm_yday, axis=1)
    df['Week'] = df['DayOfYear'] // 7 + 1
    df['DepHour'] = df['CRSDepTime'] // 100  # e.g. 1435 -> 14

    ### Drop unused columns
    df = df.drop(['DayOfMonth', 'CRSDepTime'], axis=1).sort_index(axis=1)

    ## df.head()
    return df
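# The DayOfYear column above relies on struct_time.tm_yday; for example:
import datetime
datetime.datetime(2008, 3, 1).timetuple().tm_yday  # 61 (2008 is a leap year)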
def avg_medal_count():
    '''
    Using the dataframe's apply method, create a new Series called
    avg_medal_count that indicates the average number of gold, silver,
    and bronze medals earned amongst countries who earned at least one
    medal of any kind at the 2014 Sochi olympics.

    Note that the countries list already only includes countries that have
    earned at least one medal. No additional filtering is necessary.

    You do not need to call the function in your code when running it in
    the browser - the grader will do that automatically when you submit
    or test it.
    '''
    countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
                 'Netherlands', 'Germany', 'Switzerland', 'Belarus',
                 'Austria', 'France', 'Poland', 'China', 'Korea',
                 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
                 'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
                 'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']

    gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1,
            1, 0, 0, 0, 0, 0]
    silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0,
              0, 2, 2, 2, 1, 0]
    bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1,
              0, 6, 2, 1, 0, 1]

    olympic_medal_counts = {'gold': Series(gold),
                            'silver': Series(silver),
                            'bronze': Series(bronze)}
    df = DataFrame(olympic_medal_counts)

    # YOUR CODE HERE
    avg_medal_count = df.apply(numpy.mean, axis=0)
    print(avg_medal_count)
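# For reference, the column-wise apply above is equivalent to the built-in
# reduction; a quick check on a toy frame:
import numpy
from pandas import DataFrame

toy = DataFrame({'gold': [1, 2], 'silver': [3, 4]})
assert (toy.apply(numpy.mean, axis=0) == toy.mean(axis=0)).all()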
def customer_lifetime_value(self, transaction_prediction_model, frequency,
                            recency, T, monetary_value, time=12,
                            discount_rate=1):
    """
    This method computes the average lifetime value for a group of one or
    more customers.

    transaction_prediction_model: the model to predict future transactions;
        the literature uses Pareto/NBD, but we can also use a different
        model like BG/NBD
    frequency: the frequency vector of customers' purchases
        (denoted x in literature).
    recency: the recency vector of customers' purchases
        (denoted t_x in literature).
    T: the vector of customers' age (time since first purchase)
    monetary_value: the monetary value vector of customers' purchases
        (denoted m in literature).
    time: the lifetime expected for the user in months. Default: 12
    discount_rate: the monthly adjusted discount rate. Default: 1

    Returns: the conditional expectation of the average profit per
        transaction. Also creates a discounted_monthly_cash_flows attribute.
    """
    df = DataFrame()
    df['frequency'] = frequency
    df['recency'] = recency
    df['T'] = T

    d = discount_rate
    m = self.conditional_expected_average_profit()
    discounted_monthly_cash_flows = []
    for i in range(30, (time * 30) + 1, 30):
        df['expected_revenues_period_' + str(i)] = df.apply(
            lambda r: (m * transaction_prediction_model.predict(
                i, r['frequency'], r['recency'], r['T'])
                / (1 + d) ** (i / 30)),
            axis=1
        )
        discounted_monthly_cash_flows.append(
            df['expected_revenues_period_' + str(i)].sum())
    # keep the per-month cash flows around, as described in the docstring
    self.discounted_monthly_cash_flows = discounted_monthly_cash_flows
    return sum(discounted_monthly_cash_flows)
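# The discounting logic above, reduced to a standalone sketch: with m the
# average profit per transaction and d the monthly discount rate, month k's
# predicted transactions n_k contribute m * n_k / (1 + d)**k. The numbers
# here are hypothetical.
m, d = 25.0, 0.01
expected_txns = [1.2, 0.9, 0.7]  # model predictions for months 1..3
clv = sum(m * n / (1 + d) ** (k + 1) for k, n in enumerate(expected_txns))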
def pastas_hook(obj):
    for key, value in obj.items():
        if key in ["tmin", "tmax", "date_modified", "date_created"]:
            val = Timestamp(value)
            if val is NaT:
                val = None
            obj[key] = val
        elif key == "series":
            try:
                obj[key] = read_json(value, typ='series', orient="split")
            except Exception:
                try:
                    obj[key] = TimeSeries(**value)
                except Exception:
                    obj[key] = value
        elif key == "time_offset":
            obj[key] = Timedelta(value)
        elif key == "parameters":
            # Necessary to maintain order when using the JSON format!
            value = json.loads(value, object_pairs_hook=OrderedDict)
            param = DataFrame(data=value, columns=value.keys()).T
            obj[key] = param.apply(to_numeric, errors="ignore")
        else:
            try:
                obj[key] = json.loads(value, object_hook=pastas_hook)
            except Exception:
                obj[key] = value
    return obj
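# Typical use of the hook above: hand it to the json module when decoding a
# saved model (the file name is a placeholder).
import json

with open("model.pas") as f:
    data = json.load(f, object_hook=pastas_hook)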
def test_apply_modify_traceback(self):
    data = DataFrame(
        {
            "A": ["foo", "foo", "foo", "foo", "bar", "bar",
                  "bar", "bar", "foo", "foo", "foo"],
            "B": ["one", "one", "one", "two", "one", "one",
                  "one", "two", "two", "two", "one"],
            "C": ["dull", "dull", "shiny", "dull", "dull", "shiny",
                  "shiny", "dull", "shiny", "shiny", "shiny"],
            "D": np.random.randn(11),
            "E": np.random.randn(11),
            "F": np.random.randn(11),
        }
    )

    data.loc[4, "C"] = np.nan

    def transform(row):
        if row["C"].startswith("shin") and row["A"] == "foo":
            row["D"] = 7
        return row

    def transform2(row):
        if (notnull(row["C"]) and row["C"].startswith("shin") and
                row["A"] == "foo"):
            row["D"] = 7
        return row

    try:
        transformed = data.apply(transform, axis=1)  # noqa
    except AttributeError as e:
        self.assertEqual(len(e.args), 2)
        self.assertEqual(e.args[1], "occurred at index 4")
        self.assertEqual(e.args[0],
                         "'float' object has no attribute 'startswith'")
def test_apply_bug(self):
    # GH 6125
    positions = pd.DataFrame(
        [[1, "ABC0", 50], [1, "YUM0", 20], [1, "DEF0", 20],
         [2, "ABC1", 50], [2, "YUM1", 20], [2, "DEF1", 20]],
        columns=["a", "market", "position"],
    )

    def f(r):
        return r["market"]

    expected = positions.apply(f, axis=1)

    positions = DataFrame(
        [
            [datetime(2013, 1, 1), "ABC0", 50],
            [datetime(2013, 1, 2), "YUM0", 20],
            [datetime(2013, 1, 3), "DEF0", 20],
            [datetime(2013, 1, 4), "ABC1", 50],
            [datetime(2013, 1, 5), "YUM1", 20],
            [datetime(2013, 1, 6), "DEF1", 20],
        ],
        columns=["a", "market", "position"],
    )
    result = positions.apply(f, axis=1)
    assert_series_equal(result, expected)
def test_apply_mixed_datetimelike(self):
    # mixed datetimelike
    # GH 7778
    df = DataFrame({'A': date_range('20130101', periods=3),
                    'B': pd.to_timedelta(np.arange(3), unit='s')})
    result = df.apply(lambda x: x, axis=1)
    assert_frame_equal(result, df)
def _grading_policy(self):
    '''
    Gets the grading policy from the course policy.

    Returns
    -------
    grading_policy : DataFrame
        Information about how grades are determined in a course.
    '''
    course_policy = self._xd.get('grading_policy')
    grading_policy = DataFrame(course_policy.iloc[0, :]['GRADER'])
    # type == the gformat of sequences
    grading_policy = grading_policy.set_index('type')

    def max_seqs(seq_type):
        '''
        Determines the max number of sequences that should contribute to a
        user's final grade for each gformat.
        '''
        return seq_type.get('min_count', 1) - seq_type.get('drop_count', 0)

    grading_policy['max_seqs'] = grading_policy.apply(max_seqs, axis=1)
    return grading_policy
def test_str_accessor_in_apply_func():
    # https://github.com/pandas-dev/pandas/issues/38979
    df = DataFrame(zip("abc", "def"))
    expected = Series(["A/D", "B/E", "C/F"])
    result = df.apply(lambda f: "/".join(f.str.upper()), axis=1)
    tm.assert_series_equal(result, expected)
def prepare_training_data(training_data: pd.DataFrame,
                          logger: logging.Logger,
                          filter_data_top_n=0) -> pd.DataFrame:
    """Adds columns: [bm25_scores_encoded, use_scores_encoded,
    scores_concatenated]

    Parameters
    ----------
    training_data: pd.DataFrame
        Needs to have columns ['bm25_class_labels', 'bm25_scores',
        'use_class_labels', 'use_scores']. The columns are lists encoded as
        strings at this moment -> they will be converted to lists.
    filter_data_top_n: integer
        If n > 0, filters out all datapoints where the correct result was
        not under the first n responses in USE and BM25.

    Returns
    -------
    training_data: pd.DataFrame
        The df extended by [bm25_scores_encoded, use_scores_encoded,
        scores_concatenated]
    """
    # evaluate the list values in the dataframe
    logger.info("# evaluate the list values in the dataframe")
    for column in ['bm25_class_labels', 'bm25_scores',
                   'use_class_labels', 'use_scores']:
        try:
            training_data[column] = training_data[column].apply(
                ast.literal_eval)
        except ValueError:
            print("Column already in right format")

    # infer the number of classes
    logger.info("# infer the number of classes")
    num_of_classes = training_data.class_label.nunique()

    # encode scores and write new columns
    logger.info("# encode scores and write new columns")
    training_data["bm25_scores_encoded"] = training_data[
        ["bm25_class_labels", "bm25_scores"]
    ].apply(lambda x: encode_ids_and_scores(x, num_of_classes), axis=1)
    training_data["use_scores_encoded"] = training_data[
        ["use_class_labels", "use_scores"]
    ].apply(lambda x: encode_ids_and_scores(x, num_of_classes), axis=1)
    training_data["scores_concatenated"] = training_data[
        ["bm25_scores_encoded", "use_scores_encoded"]
    ].apply(lambda row: np.concatenate((row[0], row[1])), axis=1)

    # this filter keeps only the datapoints where the right result has been
    # under the top_n responses of both USE and BM25 -- don't train on
    # something where there is absolutely no information
    def filter_result_not_found_under_n_responses(row, n):
        class_ = row["class_label"]
        if (class_ not in row["bm25_class_labels"][:n]
                or class_ not in row["use_class_labels"][:n]):
            return False
        else:
            return True

    if filter_data_top_n:
        logger.info("# Filter datapoints")
        training_data = training_data[training_data.apply(
            lambda row: filter_result_not_found_under_n_responses(
                row, filter_data_top_n),
            axis=1)]

    return training_data
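# ast.literal_eval, used above, safely turns the stringified lists back into
# Python objects, e.g.:
import ast

ast.literal_eval("[0.12, 0.5, 0.33]")  # -> [0.12, 0.5, 0.33]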
def recreate_sampling_times(
    data: DataFrame,
    step_length: float,
    start_time: float,
    end_time: float,
    plot_col=None,
) -> DataFrame:
    """
    Transforms measurement data with samples taken at any (possibly
    irregular) sample rate and outputs the same measurements evenly spaced
    according to a given step length.

    data: dataframe with numeric values that includes a 'Time' column
    step_length: desired time between each sample timestep
    start_time, end_time: bounds of the time span covered by the measurements
    plot_col: name of column that should be plotted before and after
        (for verification purposes)
    """

    first_time_in_df = data[DFKeys.TIME.value].iloc[0]
    if start_time < first_time_in_df:
        raise ValueError("start time cannot precede first time in df")

    get_shifted_time = lambda row: row[DFKeys.TIME.value] - start_time
    shifted_timestamps = data.apply(get_shifted_time,
                                    axis=1).rename(DFKeys.TIME.value, axis=1)

    duration = end_time - start_time
    timesteps = np.arange(0, duration, step_length)
    new_columns = [pd.Series(timesteps, name=DFKeys.TIME.value)]
    columns_except_time = data.columns.difference([
        DFKeys.TIME.value,
        "child_frame_id",
        "header.frame_id",
        "header.seq",
        "header.stamp.nsecs",
        "header.stamp.secs",
        "pose.covariance",
        "twist.covariance",
        "pins_0",
        "pins_1",
        "pins_2",
        "pins_3",
        "pins_4",
        "pins_5",
        "pins_6",
        "pins_7",
    ])

    for col_name in columns_except_time:
        f = interp1d(shifted_timestamps.values, data[col_name].values)
        new_columns.append(pd.Series(f(timesteps), name=col_name))
    data_new = pd.concat(new_columns, axis=1)

    if plot_col in data.columns:
        SAVEDIR = Path("results/interpolation")
        sea.set_style("white")
        # plt.figure(figsize=(5, 2.5))
        sea.lineplot(x=shifted_timestamps.values, y=data[plot_col],
                     label="original")
        sea.lineplot(x=DFKeys.TIME.value, y=plot_col, data=data_new,
                     label="interpolated")
        # plt.ylabel("Velocity")
        # plt.savefig(SAVEDIR.joinpath("%s.pdf" % plot_col))
        plt.show()

    return data_new
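# The interpolation step above, isolated from the DFKeys enum: resample an
# irregularly sampled signal onto a fixed grid with scipy's interp1d.
import numpy as np
from scipy.interpolate import interp1d

t = np.array([0.0, 0.13, 0.31, 0.55, 1.02])  # irregular sample times
y = np.sin(t)
f = interp1d(t, y)
y_even = f(np.arange(0.0, 1.0, 0.1))  # evenly spaced resampling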
def _apply(self, df: pd.DataFrame) -> pd.DataFrame:
    return df[df.apply(self.condition, axis=1)]
def prepare_data(test, traces, options):
    std_out('Preparing data for plot')

    # Dataframe to return
    df = DataFrame()

    # Check if there are different subplots
    n_subplots = 1
    for trace in traces:
        if 'subplot' in traces[trace].keys():
            n_subplots = max(n_subplots, traces[trace]['subplot'])
        else:
            std_out(f'Trace {trace} not assigned to subplot. Skipping',
                    'WARNING')

    std_out(f'Making {n_subplots} subplots')

    # Generate list of subplots
    subplots = [[] for x in range(n_subplots)]

    # Put data in the df
    for trace in traces.keys():
        if 'subplot' not in traces[trace].keys():
            std_out(f'The trace {traces[trace]} was not placed in any '
                    f'subplot. Assuming subplot #1', 'WARNING')
            traces[trace]['subplot'] = 1

        ndevs = traces[trace]['devices']
        nchans = traces[trace]['channel']

        # Make them lists always
        if ndevs == 'all':
            devices = list(test.devices.keys())
        elif type(ndevs) == str or type(ndevs) == int:
            devices = [ndevs]
        else:
            devices = ndevs

        for device in devices:
            ndev = str(device)

            # Make them lists always
            if nchans == 'all':
                channels = list(test.devices[ndev].readings.columns)
            elif type(nchans) == str:
                channels = [nchans]
            else:
                channels = nchans

            for channel in channels:
                # Check if device is in columns
                if channel not in test.devices[ndev].readings.columns:
                    std_out(f'The device {ndev} does not contain {channel}. '
                            f'Ignoring', 'WARNING')
                    continue

                # Put channel in subplots
                subplots[traces[trace]['subplot'] - 1].append(
                    channel + '_' + ndev)

                column_orig = [channel]
                columns_add = [channel + '_' + ndev]

                # Add filtering name to dfdev
                if 'filter' in traces[trace]:
                    col_name = traces[trace]['filter']['col']
                    if col_name not in test.devices[ndev].readings.columns:
                        std_out(f'Column {col_name} not in dataframe. '
                                f'Ignoring filtering', 'WARNING')
                    else:
                        column_orig.append(col_name)
                        columns_add.append(col_name)

                # Device dataframe
                dfdev = DataFrame(
                    test.devices[ndev].readings[column_orig].values,
                    columns=columns_add,
                    index=test.devices[ndev].readings.index)

                # Add filtering function
                if 'filter' in traces[trace]:
                    value = traces[trace]['filter']['value']
                    relationship = traces[trace]['filter']['relationship']
                    if col_name in dfdev.columns:
                        # apply the filter and keep the result
                        if relationship == '==':
                            dfdev = dfdev.loc[dfdev[col_name] == value]
                        elif relationship == '<=':
                            dfdev = dfdev.loc[dfdev[col_name] <= value]
                        elif relationship == '>=':
                            dfdev = dfdev.loc[dfdev[col_name] >= value]
                        elif relationship == '<':
                            dfdev = dfdev.loc[dfdev[col_name] < value]
                        elif relationship == '>':
                            dfdev = dfdev.loc[dfdev[col_name] > value]
                        else:
                            std_out("Not a valid relationship. Valid "
                                    "options: '==', '<=', '>=', '<', '>'",
                                    'ERROR')
                            continue
                        # Remove column for filtering from dfdev
                        dfdev.drop(columns=[col_name], inplace=True)

                # Combine it in the df
                df = df.combine_first(dfdev)

                # Add average or other extras
                # TODO Check this to simplify
                # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.core.resample.Resampler.aggregate.html
                if 'extras' in traces[trace]:
                    for extra in traces[trace]['extras']:
                        extra_name = channel + f'-{extra.upper()}'
                        sbl = subplots[traces[trace]['subplot'] - 1]
                        if extra == 'max':
                            df[extra_name] = df.loc[:, sbl].max(axis=1)
                        if extra == 'mean':
                            df[extra_name] = df.loc[:, sbl].mean(axis=1)
                        if extra == 'min':
                            df[extra_name] = df.loc[:, sbl].min(axis=1)
                        subplots[traces[trace]['subplot'] - 1].append(
                            extra_name)

    # Trim data
    if options['min_date'] is not None:
        df = df[df.index > options['min_date']]
    if options['max_date'] is not None:
        df = df[df.index < options['max_date']]

    # Make sure everything is numeric before resampling
    # https://stackoverflow.com/questions/34257069/resampling-pandas-dataframe-is-deleting-column#34270422
    df = df.apply(to_numeric, errors='coerce')

    # Resample it
    if options['frequency'] is not None:
        std_out(f"Resampling at {options['frequency']}", "INFO")
        if 'resample' in options:
            if options['resample'] == 'max':
                df = df.resample(options['frequency']).max()
            if options['resample'] == 'min':
                df = df.resample(options['frequency']).min()
            if options['resample'] == 'mean':
                df = df.resample(options['frequency']).mean()
        else:
            df = df.resample(options['frequency']).mean()

    # Clean na
    if options['clean_na'] is not None:
        if options['clean_na'] == 'fill':
            df = df.fillna(method='ffill')
        if options['clean_na'] == 'drop':
            df.dropna(axis=0, how='any', inplace=True)

    if df.empty:
        std_out('Dataframe for selected options is empty', 'WARNING')

    return df, subplots
print(df.cumsum())  # accumulation method
print(df.describe())
obj = Series(['a', 'a', 'b', 'c'] * 4)
print(obj)
print(obj.sort_index())
print(obj.describe())
"""

# 5.3.1 Correlation and covariance
# 5.3.2 Unique values, value counts, and membership
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()
print(uniques)
print(obj.value_counts())
print(pd.value_counts(obj.values, sort=False))
mask = obj.isin(['b', 'c'])
print(mask)
print(obj[mask])
to_match = Series(['c', 'a', 'b', 'b', 'c', 'a'])
unique_vals = Series(['c', 'b', 'a'])
print(pd.Index(unique_vals).get_indexer(to_match))
data = DataFrame({'Qu1': [1, 3, 4, 3, 4],
                  'Qu2': [2, 3, 1, 2, 3],
                  'Qu3': [1, 5, 2, 4, 4]})
print(data)
print(data.apply(pd.value_counts).fillna(0))
votes_rcvd = {
    'Candidate': ['Khan', 'Correy', 'Li', 'OTooley'],
    'Votes': [2218231, 704200, 492940, 105630]
}
votes_df = DataFrame(votes_rcvd, columns=['Candidate', 'Votes'])
votes_df.set_index('Candidate')


# In[124]:


# Add column showing percentage of total vote
votes_df["Vote_Percentage"] = (votes_df['Votes'] / total_votes) * 100
votes_df.set_index('Candidate')


# In[125]:


total = votes_df.apply(np.sum)
total['Candidate'] = 'total'
votes_df.append(pd.DataFrame(total.values, index=total.keys()).T,
                ignore_index=True)


# In[126]:


totaled_votes_df = votes_df.append(
    {
        'Candidate': 'Total',
        'Votes': total_votes,
        'Vote_Percentage': '100'
    },
    ignore_index=True)
totaled_votes_df.set_index('Candidate')
# import modules
from pandas import DataFrame
import pandas as pd
import numpy as np

# generate the DataFrame data
df = DataFrame(np.random.randn(4, 5), columns=['A', 'B', 'C', 'D', 'E'])

# DataFrame preview:
#           A         B         C         D         E
# 0  0.673092  0.230338 -0.171681  0.312303 -0.184813
# 1 -0.504482 -0.344286 -0.050845 -0.811277 -0.298181
# 2  0.542788  0.207708  0.651379 -0.656214  0.507595
# 3 -0.249410  0.131549 -2.198480 -0.437407  1.628228

# compute each row's total across the columns and append it as a new column
df['Col_sum'] = df.apply(lambda x: x.sum(), axis=1)
# df['Col_sum'] = df.apply(lambda x: x[0], axis=1)
print(df)

# compute each column's total across the rows and append it as a new row
df.loc['Row_sum'] = df.apply(lambda x: x.sum())
# df.loc['Row_sum'] = df.apply(lambda x: x[0])
print(df)

# final result:
#                 A         B         C         D         E   Col_sum
# 0        0.673092  0.230338 -0.171681  0.312303 -0.184813  0.859238
# 1       -0.504482 -0.344286 -0.050845 -0.811277 -0.298181 -2.009071
# 2        0.542788  0.207708  0.651379 -0.656214  0.507595  1.253256
# 3       -0.249410  0.131549 -2.198480 -0.437407  1.628228 -1.125520
# Row_sum  0.461987  0.225310 -1.769627 -1.592595  1.652828 -1.022097
class TestMoments(unittest.TestCase):

    _multiprocess_can_split_ = True

    _nan_locs = np.arange(20, 40)
    _inf_locs = np.array([])

    def setUp(self):
        arr = randn(N)
        arr[self._nan_locs] = np.NaN

        self.arr = arr
        self.rng = bdate_range(datetime(2009, 1, 1), periods=N)

        self.series = Series(arr.copy(), index=self.rng)

        self.frame = DataFrame(randn(N, K), index=self.rng,
                               columns=np.arange(K))

    def test_centered_axis_validation(self):
        # ok
        mom.rolling_mean(Series(np.ones(10)), 3, center=True, axis=0)
        # bad axis
        self.assertRaises(ValueError, mom.rolling_mean,
                          Series(np.ones(10)), 3, center=True, axis=1)

        # ok ok
        mom.rolling_mean(DataFrame(np.ones((10, 10))), 3,
                         center=True, axis=0)
        mom.rolling_mean(DataFrame(np.ones((10, 10))), 3,
                         center=True, axis=1)
        # bad axis
        self.assertRaises(ValueError, mom.rolling_mean,
                          DataFrame(np.ones((10, 10))), 3,
                          center=True, axis=2)

    def test_rolling_sum(self):
        self._check_moment_func(mom.rolling_sum, np.sum)

    def test_rolling_count(self):
        counter = lambda x: np.isfinite(x).astype(float).sum()
        self._check_moment_func(mom.rolling_count, counter,
                                has_min_periods=False,
                                preserve_nan=False,
                                fill_value=0)

    def test_rolling_mean(self):
        self._check_moment_func(mom.rolling_mean, np.mean)

    def test_cmov_mean(self):
        try:
            from scikits.timeseries.lib import cmov_mean
        except ImportError:
            raise nose.SkipTest

        vals = np.random.randn(10)
        xp = cmov_mean(vals, 5)
        rs = mom.rolling_mean(vals, 5, center=True)
        assert_almost_equal(xp.compressed(), rs[2:-2])
        assert_almost_equal(xp.mask, np.isnan(rs))

        xp = Series(rs)
        rs = mom.rolling_mean(Series(vals), 5, center=True)
        assert_series_equal(xp, rs)

    def test_cmov_window(self):
        try:
            from scikits.timeseries.lib import cmov_window
        except ImportError:
            raise nose.SkipTest

        vals = np.random.randn(10)
        xp = cmov_window(vals, 5, 'boxcar')
        rs = mom.rolling_window(vals, 5, 'boxcar', center=True)
        assert_almost_equal(xp.compressed(), rs[2:-2])
        assert_almost_equal(xp.mask, np.isnan(rs))

        xp = Series(rs)
        rs = mom.rolling_window(Series(vals), 5, 'boxcar', center=True)
        assert_series_equal(xp, rs)

    def test_cmov_window_corner(self):
        try:
            from scikits.timeseries.lib import cmov_window
        except ImportError:
            raise nose.SkipTest

        # all nan
        vals = np.empty(10, dtype=float)
        vals.fill(np.nan)
        rs = mom.rolling_window(vals, 5, 'boxcar', center=True)
        self.assert_(np.isnan(rs).all())

        # empty
        vals = np.array([])
        rs = mom.rolling_window(vals, 5, 'boxcar', center=True)
        self.assert_(len(rs) == 0)

        # shorter than window
        vals = np.random.randn(5)
        rs = mom.rolling_window(vals, 10, 'boxcar')
        self.assert_(np.isnan(rs).all())
        self.assert_(len(rs) == 5)

    def test_cmov_window_frame(self):
        try:
            from scikits.timeseries.lib import cmov_window
        except ImportError:
            raise nose.SkipTest

        # DataFrame
        vals = np.random.randn(10, 2)
        xp = cmov_window(vals, 5, 'boxcar')
        rs = mom.rolling_window(DataFrame(vals), 5, 'boxcar', center=True)
        assert_frame_equal(DataFrame(xp), rs)

    def test_cmov_window_na_min_periods(self):
        try:
            from scikits.timeseries.lib import cmov_window
        except ImportError:
            raise nose.SkipTest

        # min_periods
        vals = Series(np.random.randn(10))
        vals[4] = np.nan
        vals[8] = np.nan

        xp = mom.rolling_mean(vals, 5, min_periods=4, center=True)
        rs = mom.rolling_window(vals, 5, 'boxcar', min_periods=4,
                                center=True)
        assert_series_equal(xp, rs)

    def test_cmov_window_regular(self):
        try:
            from scikits.timeseries.lib import cmov_window
        except ImportError:
            raise nose.SkipTest

        win_types = ['triang', 'blackman', 'hamming', 'bartlett', 'bohman',
                     'blackmanharris', 'nuttall', 'barthann']
        for wt in win_types:
            vals = np.random.randn(10)
            xp = cmov_window(vals, 5, wt)

            rs = mom.rolling_window(Series(vals), 5, wt, center=True)
            assert_series_equal(Series(xp), rs)

    def test_cmov_window_special(self):
        try:
            from scikits.timeseries.lib import cmov_window
        except ImportError:
            raise nose.SkipTest

        win_types = ['kaiser', 'gaussian', 'general_gaussian', 'slepian']
        kwds = [{'beta': 1.}, {'std': 1.}, {'power': 2., 'width': 2.},
                {'width': 0.5}]

        for wt, k in zip(win_types, kwds):
            vals = np.random.randn(10)
            xp = cmov_window(vals, 5, (wt,) + tuple(k.values()))

            rs = mom.rolling_window(Series(vals), 5, wt, center=True, **k)
            assert_series_equal(Series(xp), rs)

    def test_rolling_median(self):
        self._check_moment_func(mom.rolling_median, np.median)

    def test_rolling_min(self):
        self._check_moment_func(mom.rolling_min, np.min)

        a = np.array([1, 2, 3, 4, 5])
        b = mom.rolling_min(a, window=100, min_periods=1)
        assert_almost_equal(b, np.ones(len(a)))

        self.assertRaises(ValueError, mom.rolling_min, np.array([1, 2, 3]),
                          window=3, min_periods=5)

    def test_rolling_max(self):
        self._check_moment_func(mom.rolling_max, np.max)

        a = np.array([1, 2, 3, 4, 5])
        b = mom.rolling_max(a, window=100, min_periods=1)
        assert_almost_equal(a, b)

        self.assertRaises(ValueError, mom.rolling_max, np.array([1, 2, 3]),
                          window=3, min_periods=5)

    def test_rolling_quantile(self):
        qs = [.1, .5, .9]

        def scoreatpercentile(a, per):
            values = np.sort(a, axis=0)

            idx = per / 1. * (values.shape[0] - 1)
            return values[int(idx)]

        for q in qs:
            def f(x, window, min_periods=None, freq=None, center=False):
                return mom.rolling_quantile(x, window, q,
                                            min_periods=min_periods,
                                            freq=freq,
                                            center=center)

            def alt(x):
                return scoreatpercentile(x, q)

            self._check_moment_func(f, alt)

    def test_rolling_apply(self):
        ser = Series([])
        assert_series_equal(
            ser, mom.rolling_apply(ser, 10, lambda x: x.mean()))

        def roll_mean(x, window, min_periods=None, freq=None, center=False):
            return mom.rolling_apply(x, window,
                                     lambda x: x[np.isfinite(x)].mean(),
                                     min_periods=min_periods,
                                     freq=freq,
                                     center=center)
        self._check_moment_func(roll_mean, np.mean)

    def test_rolling_apply_out_of_bounds(self):
        # #1850
        arr = np.arange(4)

        # it works!
        result = mom.rolling_apply(arr, 10, np.sum)
        self.assert_(isnull(result).all())

        result = mom.rolling_apply(arr, 10, np.sum, min_periods=1)
        assert_almost_equal(result, result)

    def test_rolling_std(self):
        self._check_moment_func(mom.rolling_std,
                                lambda x: np.std(x, ddof=1))
        self._check_moment_func(functools.partial(mom.rolling_std, ddof=0),
                                lambda x: np.std(x, ddof=0))

    def test_rolling_std_1obs(self):
        result = mom.rolling_std(np.array([1., 2., 3., 4., 5.]),
                                 1, min_periods=1)
        expected = np.zeros(5)

        assert_almost_equal(result, expected)

        result = mom.rolling_std(np.array([np.nan, np.nan, 3., 4., 5.]),
                                 3, min_periods=2)
        self.assert_(np.isnan(result[2]))

    def test_rolling_std_neg_sqrt(self):
        # unit test from Bottleneck

        # Test move_nanstd for neg sqrt.
        a = np.array([0.0011448196318903589,
                      0.00028718669878572767,
                      0.00028718669878572767,
                      0.00028718669878572767,
                      0.00028718669878572767])
        b = mom.rolling_std(a, window=3)
        self.assert_(np.isfinite(b[2:]).all())

        b = mom.ewmstd(a, span=3)
        self.assert_(np.isfinite(b[2:]).all())

    def test_rolling_var(self):
        self._check_moment_func(mom.rolling_var,
                                lambda x: np.var(x, ddof=1))
        self._check_moment_func(functools.partial(mom.rolling_var, ddof=0),
                                lambda x: np.var(x, ddof=0))

    def test_rolling_skew(self):
        try:
            from scipy.stats import skew
        except ImportError:
            raise nose.SkipTest('no scipy')
        self._check_moment_func(mom.rolling_skew,
                                lambda x: skew(x, bias=False))

    def test_rolling_kurt(self):
        try:
            from scipy.stats import kurtosis
        except ImportError:
            raise nose.SkipTest('no scipy')
        self._check_moment_func(mom.rolling_kurt,
                                lambda x: kurtosis(x, bias=False))

    def test_fperr_robustness(self):
        # TODO: remove this once python 2.5 out of picture
        if PY3:
            raise nose.SkipTest

        # #2114
        data = '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1a@\xaa\xaa\xaa\xaa\xaa\xaa\x02@8\x8e\xe38\x8e\xe3\xe8?z\t\xed%\xb4\x97\xd0?\xa2\x0c<\xdd\x9a\x1f\xb6?\x82\xbb\xfa&y\x7f\x9d?\xac\'\xa7\xc4P\xaa\x83?\x90\xdf\xde\xb0k8j?`\xea\xe9u\xf2zQ?*\xe37\x9d\x98N7?\xe2.\xf5&v\x13\x1f?\xec\xc9\xf8\x19\xa4\xb7\x04?\x90b\xf6w\x85\x9f\xeb>\xb5A\xa4\xfaXj\xd2>F\x02\xdb\xf8\xcb\x8d\xb8>.\xac<\xfb\x87^\xa0>\xe8:\xa6\xf9_\xd3\x85>\xfb?\xe2cUU\xfd?\xfc\x7fA\xed8\x8e\xe3?\xa5\xaa\xac\x91\xf6\x12\xca?n\x1cs\xb6\xf9a\xb1?\xe8%D\xf3L-\x97?5\xddZD\x11\xe7~?#>\xe7\x82\x0b\x9ad?\xd9R4Y\x0fxK?;7x;\nP2?N\xf4JO\xb8j\x18?4\xf81\x8a%G\x00?\x9a\xf5\x97\r2\xb4\xe5>\xcd\x9c\xca\xbcB\xf0\xcc>3\x13\x87(\xd7J\xb3>\x99\x19\xb4\xe0\x1e\xb9\x99>ff\xcd\x95\x14&\x81>\x88\x88\xbc\xc7p\xddf>`\x0b\xa6_\x96|N>@\xb2n\xea\x0eS4>U\x98\x938i\x19\x1b>\x8eeb\xd0\xf0\x10\x02>\xbd\xdc-k\x96\x16\xe8=(\x93\x1e\xf2\x0e\x0f\xd0=\xe0n\xd3Bii\xb5=*\xe9\x19Y\x8c\x8c\x9c=\xc6\xf0\xbb\x90]\x08\x83=]\x96\xfa\xc0|`i=>d\xfc\xd5\xfd\xeaP=R0\xfb\xc7\xa7\x8e6=\xc2\x95\xf9_\x8a\x13\x1e=\xd6c\xa6\xea\x06\r\x04=r\xda\xdd8\t\xbc\xea<\xf6\xe6\x93\xd0\xb0\xd2\xd1<\x9d\xdeok\x96\xc3\xb7<&~\xea9s\xaf\x9f<UUUUUU\x13@q\x1c\xc7q\x1c\xc7\xf9?\xf6\x12\xdaKh/\xe1?\xf2\xc3"e\xe0\xe9\xc6?\xed\xaf\x831+\x8d\xae?\xf3\x1f\xad\xcb\x1c^\x94?\x15\x1e\xdd\xbd>\xb8\x02@\xc6\xd2&\xfd\xa8\xf5\xe8?\xd9\xe1\x19\xfe\xc5\xa3\xd0?v\x82"\xa8\xb2/\xb6?\x9dX\x835\xee\x94\x9d?h\x90W\xce\x9e\xb8\x83?\x8a\xc0th~Kj?\\\x80\xf8\x9a\xa9\x87Q?%\xab\xa0\xce\x8c_7?1\xe4\x80\x13\x11*\x1f?\x98\x00\r\xb6\xc6\x04?\x80u\xabf\x9d\xb3\xeb>UNrD\xbew\xd2>\x1c\x13C[\xa8\x9f\xb8>\x12b\xd7<pj\xa0>m-\x1fQ@\xe3\x85>\xe6\x91)l\x00/m>Da\xc6\xf2\xaatS>\x05\xd7]\xee\xe3\xf09>'
        arr = np.frombuffer(data, dtype='<f8')
        if sys.byteorder != "little":
            arr = arr.byteswap().newbyteorder()

        result = mom.rolling_sum(arr, 2)
        self.assertTrue((result[1:] >= 0).all())

        result = mom.rolling_mean(arr, 2)
        self.assertTrue((result[1:] >= 0).all())

        result = mom.rolling_var(arr, 2)
        self.assertTrue((result[1:] >= 0).all())

        # #2527, ugh
        arr = np.array([0.00012456, 0.0003, 0])
        result = mom.rolling_mean(arr, 1)
        self.assertTrue(result[-1] >= 0)

        result = mom.rolling_mean(-arr, 1)
        self.assertTrue(result[-1] <= 0)

    def _check_moment_func(self, func, static_comp, window=50,
                           has_min_periods=True,
                           has_center=True,
                           has_time_rule=True,
                           preserve_nan=True,
                           fill_value=None):

        self._check_ndarray(func, static_comp, window=window,
                            has_min_periods=has_min_periods,
                            preserve_nan=preserve_nan,
                            has_center=has_center,
                            fill_value=fill_value)

        self._check_structures(func, static_comp,
                               has_min_periods=has_min_periods,
                               has_time_rule=has_time_rule,
                               fill_value=fill_value,
                               has_center=has_center)

    def _check_ndarray(self, func, static_comp, window=50,
                       has_min_periods=True,
                       preserve_nan=True,
                       has_center=True,
                       fill_value=None):

        result = func(self.arr, window)
        assert_almost_equal(result[-1], static_comp(self.arr[-50:]))

        if preserve_nan:
            assert(np.isnan(result[self._nan_locs]).all())

        # excluding NaNs correctly
        arr = randn(50)
        arr[:10] = np.NaN
        arr[-10:] = np.NaN

        if has_min_periods:
            result = func(arr, 50, min_periods=30)
            assert_almost_equal(result[-1], static_comp(arr[10:-10]))

            # min_periods is working correctly
            result = func(arr, 20, min_periods=15)
            self.assert_(np.isnan(result[23]))
            self.assert_(not np.isnan(result[24]))

            self.assert_(not np.isnan(result[-6]))
            self.assert_(np.isnan(result[-5]))

            arr2 = randn(20)
            result = func(arr2, 10, min_periods=5)
            self.assert_(isnull(result[3]))
            self.assert_(notnull(result[4]))

            # min_periods=0
            result0 = func(arr, 20, min_periods=0)
            result1 = func(arr, 20, min_periods=1)
            assert_almost_equal(result0, result1)
        else:
            result = func(arr, 50)
            assert_almost_equal(result[-1], static_comp(arr[10:-10]))

        if has_center:
            if has_min_periods:
                result = func(arr, 20, min_periods=15, center=True)
                expected = func(arr, 20, min_periods=15)
            else:
                result = func(arr, 20, center=True)
                expected = func(arr, 20)

            assert_almost_equal(result[1], expected[10])
            if fill_value is None:
                self.assert_(np.isnan(result[-9:]).all())
            else:
                self.assert_((result[-9:] == 0).all())
            if has_min_periods:
                self.assert_(np.isnan(expected[23]))
                self.assert_(np.isnan(result[14]))
                self.assert_(np.isnan(expected[-5]))
                self.assert_(np.isnan(result[-14]))

    def _check_structures(self, func, static_comp,
                          has_min_periods=True, has_time_rule=True,
                          has_center=True,
                          fill_value=None):

        series_result = func(self.series, 50)
        self.assert_(isinstance(series_result, Series))

        frame_result = func(self.frame, 50)
        self.assertEquals(type(frame_result), DataFrame)

        # check time_rule works
        if has_time_rule:
            win = 25
            minp = 10

            if has_min_periods:
                series_result = func(self.series[::2], win,
                                     min_periods=minp, freq='B')
                frame_result = func(self.frame[::2], win,
                                    min_periods=minp, freq='B')
            else:
                series_result = func(self.series[::2], win, freq='B')
                frame_result = func(self.frame[::2], win, freq='B')

            last_date = series_result.index[-1]
            prev_date = last_date - 24 * datetools.bday

            trunc_series = self.series[::2].truncate(prev_date, last_date)
            trunc_frame = self.frame[::2].truncate(prev_date, last_date)

            assert_almost_equal(series_result[-1],
                                static_comp(trunc_series))

            assert_almost_equal(frame_result.xs(last_date),
                                trunc_frame.apply(static_comp))

        if has_center:
            if has_min_periods:
                minp = 10
                series_xp = func(self.series, 25,
                                 min_periods=minp).shift(-12)
                frame_xp = func(self.frame, 25,
                                min_periods=minp).shift(-12)

                series_rs = func(self.series, 25, min_periods=minp,
                                 center=True)
                frame_rs = func(self.frame, 25, min_periods=minp,
                                center=True)
            else:
                series_xp = func(self.series, 25).shift(-12)
                frame_xp = func(self.frame, 25).shift(-12)

                series_rs = func(self.series, 25, center=True)
                frame_rs = func(self.frame, 25, center=True)

            if fill_value is not None:
                series_xp = series_xp.fillna(fill_value)
                frame_xp = frame_xp.fillna(fill_value)
            assert_series_equal(series_xp, series_rs)
            assert_frame_equal(frame_xp, frame_rs)

    def test_legacy_time_rule_arg(self):
        from io import StringIO
        # suppress deprecation warnings
        sys.stderr = StringIO()

        rng = bdate_range('1/1/2000', periods=20)
        ts = Series(np.random.randn(20), index=rng)
        ts = ts.take(np.random.permutation(len(ts))[:12]).sort_index()

        try:
            result = mom.rolling_mean(ts, 1, min_periods=1, freq='B')
            expected = mom.rolling_mean(ts, 1, min_periods=1,
                                        time_rule='WEEKDAY')
            tm.assert_series_equal(result, expected)

            result = mom.ewma(ts, span=5, freq='B')
            expected = mom.ewma(ts, span=5, time_rule='WEEKDAY')
            tm.assert_series_equal(result, expected)
        finally:
            sys.stderr = sys.__stderr__

    def test_ewma(self):
        self._check_ew(mom.ewma)

        arr = np.zeros(1000)
        arr[5] = 1
        result = mom.ewma(arr, span=100, adjust=False).sum()
        self.assert_(np.abs(result - 1) < 1e-2)

    def test_ewma_nan_handling(self):
        s = Series([1.] + [np.nan] * 5 + [1.])

        result = mom.ewma(s, com=5)
        assert_almost_equal(result, [1] * len(s))

    def test_ewmvar(self):
        self._check_ew(mom.ewmvar)

    def test_ewmvol(self):
        self._check_ew(mom.ewmvol)

    def test_ewma_span_com_args(self):
        A = mom.ewma(self.arr, com=9.5)
        B = mom.ewma(self.arr, span=20)
        assert_almost_equal(A, B)

        self.assertRaises(Exception, mom.ewma, self.arr, com=9.5, span=20)
        self.assertRaises(Exception, mom.ewma, self.arr)

    def test_ew_empty_arrays(self):
        arr = np.array([], dtype=np.float64)

        funcs = [mom.ewma, mom.ewmvol, mom.ewmvar]
        for f in funcs:
            result = f(arr, 3)
            assert_almost_equal(result, arr)

    def _check_ew(self, func):
        self._check_ew_ndarray(func)
        self._check_ew_structures(func)

    def _check_ew_ndarray(self, func, preserve_nan=False):
        result = func(self.arr, com=10)

        if preserve_nan:
            assert(np.isnan(result[self._nan_locs]).all())

        # excluding NaNs correctly
        arr = randn(50)
        arr[:10] = np.NaN
        arr[-10:] = np.NaN

        # ??? check something

        # pass in ints
        result2 = func(np.arange(50), span=10)
        self.assert_(result2.dtype == np.float_)

    def _check_ew_structures(self, func):
        series_result = func(self.series, com=10)
        self.assert_(isinstance(series_result, Series))
        frame_result = func(self.frame, com=10)
        self.assertEquals(type(frame_result), DataFrame)

    # binary moments
    def test_rolling_cov(self):
        A = self.series
        B = A + randn(len(A))

        result = mom.rolling_cov(A, B, 50, min_periods=25)
        assert_almost_equal(result[-1], np.cov(A[-50:], B[-50:])[0, 1])

    def test_rolling_corr(self):
        A = self.series
        B = A + randn(len(A))

        result = mom.rolling_corr(A, B, 50, min_periods=25)
        assert_almost_equal(result[-1], np.corrcoef(A[-50:], B[-50:])[0, 1])

        # test for correct bias correction
        a = tm.makeTimeSeries()
        b = tm.makeTimeSeries()
        a[:5] = np.nan
        b[:10] = np.nan

        result = mom.rolling_corr(a, b, len(a), min_periods=1)
        assert_almost_equal(result[-1], a.corr(b))

    def test_rolling_corr_pairwise(self):
        panel = mom.rolling_corr_pairwise(self.frame, 10, min_periods=5)

        correl = panel.ix[:, 1, 5]
        exp = mom.rolling_corr(self.frame[1], self.frame[5],
                               10, min_periods=5)
        tm.assert_series_equal(correl, exp)

    def test_flex_binary_moment(self):
        # GH3155
        # don't blow the stack
        self.assertRaises(ValueError, mom._flex_binary_moment, 5, 6, None)

    def test_corr_sanity(self):
        # GH 3155
        df = DataFrame(
            np.array(
                [[0.87024726, 0.18505595],
                 [0.64355431, 0.3091617],
                 [0.92372966, 0.50552513],
                 [0.00203756, 0.04520709],
                 [0.84780328, 0.33394331],
                 [0.78369152, 0.63919667]])
        )

        res = mom.rolling_corr(df[0], df[1], 5, center=True)
        self.assertTrue(all([np.abs(np.nan_to_num(x)) <= 1 for x in res]))

        # and some fuzzing
        for i in range(10):
            df = DataFrame(np.random.rand(30, 2))
            res = mom.rolling_corr(df[0], df[1], 5, center=True)
            print(res)
            self.assertTrue(all([np.abs(np.nan_to_num(x)) <= 1
                                 for x in res]))

    def test_flex_binary_frame(self):
        def _check(method):
            series = self.frame[1]

            res = method(series, self.frame, 10)
            res2 = method(self.frame, series, 10)
            exp = self.frame.apply(lambda x: method(series, x, 10))

            tm.assert_frame_equal(res, exp)
            tm.assert_frame_equal(res2, exp)

            frame2 = self.frame.copy()
            frame2.values[:] = np.random.randn(*frame2.shape)

            res3 = method(self.frame, frame2, 10)
            exp = DataFrame(dict((k, method(self.frame[k], frame2[k], 10))
                                 for k in self.frame))
            tm.assert_frame_equal(res3, exp)

        methods = [mom.rolling_corr, mom.rolling_cov]
        for meth in methods:
            _check(meth)

    def test_ewmcov(self):
        self._check_binary_ew(mom.ewmcov)

    def test_ewmcorr(self):
        self._check_binary_ew(mom.ewmcorr)

    def _check_binary_ew(self, func):
        A = Series(randn(50), index=np.arange(50))
        B = A[2:] + randn(48)

        A[:10] = np.NaN
        B[-10:] = np.NaN

        result = func(A, B, 20, min_periods=5)

        self.assert_(np.isnan(result.values[:15]).all())
        self.assert_(not np.isnan(result.values[15:]).any())

        self.assertRaises(Exception, func, A, randn(50), 20, min_periods=5)

    def test_expanding_apply(self):
        ser = Series([])
        assert_series_equal(ser,
                            mom.expanding_apply(ser, lambda x: x.mean()))

        def expanding_mean(x, min_periods=1, freq=None):
            return mom.expanding_apply(x,
                                       lambda x: x.mean(),
                                       min_periods=min_periods,
                                       freq=freq)
        self._check_expanding(expanding_mean, np.mean)

    def test_expanding_corr(self):
        A = self.series.dropna()
        B = (A + randn(len(A)))[:-5]

        result = mom.expanding_corr(A, B)

        rolling_result = mom.rolling_corr(A, B, len(A), min_periods=1)

        assert_almost_equal(rolling_result, result)

    def test_expanding_count(self):
        result = mom.expanding_count(self.series)
        assert_almost_equal(result, mom.rolling_count(self.series,
                                                      len(self.series)))

    def test_expanding_quantile(self):
        result = mom.expanding_quantile(self.series, 0.5)

        rolling_result = mom.rolling_quantile(self.series,
                                              len(self.series),
                                              0.5, min_periods=1)

        assert_almost_equal(result, rolling_result)

    def test_expanding_cov(self):
        A = self.series
        B = (A + randn(len(A)))[:-5]

        result = mom.expanding_cov(A, B)

        rolling_result = mom.rolling_cov(A, B, len(A), min_periods=1)

        assert_almost_equal(rolling_result, result)

    def test_expanding_max(self):
        self._check_expanding(mom.expanding_max, np.max, preserve_nan=False)

    def test_expanding_corr_pairwise(self):
        result = mom.expanding_corr_pairwise(self.frame)

        rolling_result = mom.rolling_corr_pairwise(self.frame,
                                                   len(self.frame),
                                                   min_periods=1)

        for i in result.items:
            assert_almost_equal(result[i], rolling_result[i])

    def _check_expanding_ndarray(self, func, static_comp,
                                 has_min_periods=True, has_time_rule=True,
                                 preserve_nan=True):
        result = func(self.arr)

        assert_almost_equal(result[10], static_comp(self.arr[:11]))

        if preserve_nan:
            assert(np.isnan(result[self._nan_locs]).all())

        arr = randn(50)

        if has_min_periods:
            result = func(arr, min_periods=30)
            assert(np.isnan(result[:29]).all())
            assert_almost_equal(result[-1], static_comp(arr[:50]))

            # min_periods is working correctly
            result = func(arr, min_periods=15)
            self.assert_(np.isnan(result[13]))
            self.assert_(not np.isnan(result[14]))

            arr2 = randn(20)
            result = func(arr2, min_periods=5)
            self.assert_(isnull(result[3]))
            self.assert_(notnull(result[4]))

            # min_periods=0
            result0 = func(arr, min_periods=0)
            result1 = func(arr, min_periods=1)
            assert_almost_equal(result0, result1)
        else:
            result = func(arr)
            assert_almost_equal(result[-1], static_comp(arr[:50]))

    def _check_expanding_structures(self, func):
        series_result = func(self.series)
        self.assert_(isinstance(series_result, Series))
        frame_result = func(self.frame)
        self.assertEquals(type(frame_result), DataFrame)

    def _check_expanding(self, func, static_comp, has_min_periods=True,
                         has_time_rule=True, preserve_nan=True):
        self._check_expanding_ndarray(func, static_comp,
                                      has_min_periods=has_min_periods,
                                      has_time_rule=has_time_rule,
                                      preserve_nan=preserve_nan)
        self._check_expanding_structures(func)
def test_apply_deprecate_reduce(self):
    empty_frame = DataFrame()

    x = []
    with tm.assert_produces_warning(FutureWarning):
        empty_frame.apply(x.append, axis=1, reduce=True)
def evaluate_prediction( prediction: Iterable[str], reference: Iterable[str], ) -> DataFrame: """Calculates F1 Score, Recall and Precision of a :func:`~pypairs.cyclone` prediction. Parameters ---------- prediction List of predicted classes. reference List of actual classes. Returns ------- A :class:`~pandas.DataFrame` with columns "f1", "recall", "precision" and "average" for all categories and an overall average containing the respective score. Example ------- To get the prediction quality for the example use case of :func:`~pypairs.cyclone` run:: from pypairs import pairs, datasets, utils, plotting import numpy as np adata = datasets.leng15('sorted') marker_pairs = datasets.default_cc_marker() scores = pairs.cyclone(adata, marker_pairs) ref_labels = list(np.repeat("G2M", 76)) + list(np.repeat("S", 80)) + list(np.repeat("G1", 91)) prediction_quality = utils.evaluate_prediction(scores['max_class'], ref_labels) print(prediction_quality) """ ref = np.array(reference) pred = np.array(prediction) labels_cats = np.unique(list(ref) + list(pred)) f1 = np.append(f1_score(ref, pred, average=None, labels=labels_cats), f1_score(ref, pred, average='macro', labels=labels_cats)) recall = np.append( recall_score(ref, pred, average=None, labels=labels_cats), recall_score(ref, pred, average='macro', labels=labels_cats)) precision = np.append( precision_score(ref, pred, average=None, labels=labels_cats), precision_score(ref, pred, average='macro', labels=labels_cats)) labels = np.append(labels_cats, "average") df = DataFrame(columns=labels, index=["f1", "recall", "precision"]) df.loc["f1"] = f1 df.loc["recall"] = recall df.loc["precision"] = precision average = np.average(df.values, axis=0) df.loc["average"] = average df = df.apply(pd.to_numeric, errors='coerce') return df.T
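# A minimal usage sketch of evaluate_prediction on toy labels (the class
# names here are illustrative, not taken from the pypairs example above):
prediction = ["G1", "S", "G2M", "G1"]
reference = ["G1", "S", "S", "G1"]
quality = evaluate_prediction(prediction, reference)
# one row per class plus "average"; columns are f1, recall, precision, average
print(quality)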
def test_int64_overflow_issues(self): # #2690, combinatorial explosion df1 = DataFrame(np.random.randn(1000, 7), columns=list('ABCDEF') + ['G1']) df2 = DataFrame(np.random.randn(1000, 7), columns=list('ABCDEF') + ['G2']) # it works! result = merge(df1, df2, how='outer') assert len(result) == 2000 low, high, n = -1 << 10, 1 << 10, 1 << 20 left = DataFrame(np.random.randint(low, high, (n, 7)), columns=list('ABCDEFG')) left['left'] = left.sum(axis=1) # one-2-one match i = np.random.permutation(len(left)) right = left.iloc[i].copy() right.columns = right.columns[:-1].tolist() + ['right'] right.index = np.arange(len(right)) right['right'] *= -1 out = merge(left, right, how='outer') assert len(out) == len(left) assert_series_equal(out['left'], -out['right'], check_names=False) result = out.iloc[:, :-2].sum(axis=1) assert_series_equal(out['left'], result, check_names=False) assert result.name is None out.sort_values(out.columns.tolist(), inplace=True) out.index = np.arange(len(out)) for how in ['left', 'right', 'outer', 'inner']: assert_frame_equal(out, merge(left, right, how=how, sort=True)) # check that left merge w/ sort=False maintains left frame order out = merge(left, right, how='left', sort=False) assert_frame_equal(left, out[left.columns.tolist()]) out = merge(right, left, how='left', sort=False) assert_frame_equal(right, out[right.columns.tolist()]) # one-2-many/none match n = 1 << 11 left = DataFrame(np.random.randint(low, high, (n, 7)).astype('int64'), columns=list('ABCDEFG')) # confirm that this is checking what it is supposed to check shape = left.apply(Series.nunique).values assert is_int64_overflow_possible(shape) # add duplicates to left frame left = concat([left, left], ignore_index=True) right = DataFrame(np.random.randint(low, high, (n // 2, 7)).astype('int64'), columns=list('ABCDEFG')) # add duplicates & overlap with left to the right frame i = np.random.choice(len(left), n) right = concat([right, right, left.iloc[i]], ignore_index=True) left['left'] = np.random.randn(len(left)) right['right'] = np.random.randn(len(right)) # shuffle left & right frames i = np.random.permutation(len(left)) left = left.iloc[i].copy() left.index = np.arange(len(left)) i = np.random.permutation(len(right)) right = right.iloc[i].copy() right.index = np.arange(len(right)) # manually compute outer merge ldict, rdict = defaultdict(list), defaultdict(list) for idx, row in left.set_index(list('ABCDEFG')).iterrows(): ldict[idx].append(row['left']) for idx, row in right.set_index(list('ABCDEFG')).iterrows(): rdict[idx].append(row['right']) vals = [] for k, lval in ldict.items(): rval = rdict.get(k, [np.nan]) for lv, rv in product(lval, rval): vals.append(k + tuple([lv, rv])) for k, rval in rdict.items(): if k not in ldict: for rv in rval: vals.append(k + tuple([np.nan, rv])) def align(df): df = df.sort_values(df.columns.tolist()) df.index = np.arange(len(df)) return df def verify_order(df): kcols = list('ABCDEFG') assert_frame_equal(df[kcols].copy(), df[kcols].sort_values(kcols, kind='mergesort')) out = DataFrame(vals, columns=list('ABCDEFG') + ['left', 'right']) out = align(out) jmask = { 'left': out['left'].notnull(), 'right': out['right'].notnull(), 'inner': out['left'].notnull() & out['right'].notnull(), 'outer': np.ones(len(out), dtype='bool') } for how in 'left', 'right', 'outer', 'inner': mask = jmask[how] frame = align(out[mask].copy()) assert mask.all() ^ mask.any() or how == 'outer' for sort in [False, True]: res = merge(left, right, how=how, sort=sort) if sort: verify_order(res) # as in GH9092 
dtypes break with outer/right join assert_frame_equal(frame, align(res), check_dtype=how not in ('right', 'outer'))
def from_df(cls, df: pd.DataFrame, **kwargs) -> 'ItemList': df['img_data'] = df.apply( lambda row: Image(npimg2tensor(generate_image(row))), axis=1) return cls(items=range(len(df)), inner_df=df.copy(), **kwargs)
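# `generate_image` and `npimg2tensor` are project helpers that are not
# shown here; hedged sketches of the shapes the call above implies:
import numpy as np
import torch

def generate_image(row):
    # hypothetical: render a row of the frame into an (H, W, C) uint8 image
    return np.zeros((64, 64, 3), dtype=np.uint8)

def npimg2tensor(img):
    # HWC uint8 array -> CHW float tensor scaled to [0, 1]
    return torch.from_numpy(img).permute(2, 0, 1).float() / 255.0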
def match_evaluations_to_courses( evaluation_narratives: pd.DataFrame, evaluation_ratings: pd.DataFrame, evaluation_statistics: pd.DataFrame, listings: pd.DataFrame, ) -> Tuple[pd.DataFrame, ...]: """ Match evaluations to course IDs. Parameters ---------- evaluation_narratives: DataFrame of narratives. evaluation_ratings: DataFrame of ratings. evaluation_statistics: DataFrame of statistics. listings: Listings DataFrame from import_courses. Returns ------- evaluation_narratives, evaluation_ratings, evaluation_statistics, evaluation_questions """ print("Matching evaluations to courses") # construct outer season grouping season_crn_to_course_id = listings[["season_code", "course_id", "crn"]].groupby("season_code") # construct inner course_code to course_id mapping season_crn_to_course_id = season_crn_to_course_id.apply( # type: ignore lambda x: x[["crn", "course_id"]].set_index("crn")["course_id"]. to_dict()) # cast outer season mapping to dictionary season_crn_to_course_id = season_crn_to_course_id.to_dict() # type: ignore def get_course_id(row): course_id = season_crn_to_course_id.get(row["season"], {}).get(row["crn"], None) return course_id # get course IDs evaluation_narratives["course_id"] = evaluation_narratives.apply( get_course_id, axis=1) evaluation_ratings["course_id"] = evaluation_ratings.apply(get_course_id, axis=1) evaluation_statistics["course_id"] = evaluation_statistics.apply( get_course_id, axis=1) # each course must have exactly one statistic, so use this for reporting nan_total = evaluation_statistics["course_id"].isna().sum() print( f"Removing {nan_total}/{len(evaluation_statistics)} evaluated courses without matches" ) # remove unmatched courses evaluation_narratives.dropna(subset=["course_id"], axis=0, inplace=True) evaluation_ratings.dropna(subset=["course_id"], axis=0, inplace=True) evaluation_statistics.dropna(subset=["course_id"], axis=0, inplace=True) # change from float to integer type for import evaluation_narratives["course_id"] = evaluation_narratives[ "course_id"].astype(int) evaluation_ratings["course_id"] = evaluation_ratings["course_id"].astype( int) evaluation_statistics["course_id"] = evaluation_statistics[ "course_id"].astype(int) # drop cross-listing duplicates evaluation_statistics.drop_duplicates( # type: ignore subset=["course_id"], inplace=True, keep="first") evaluation_ratings.drop_duplicates( # type: ignore subset=["course_id", "question_code"], inplace=True, keep="first") evaluation_narratives.drop_duplicates( # type: ignore subset=["course_id", "question_code", "comment"], inplace=True, keep="first") return evaluation_statistics, evaluation_narratives, evaluation_ratings
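# A toy sketch of the nested season -> crn -> course_id lookup built above
# (column names as in the function; the data is made up):
import pandas as pd

listings = pd.DataFrame({
    "season_code": ["202001", "202001", "202101"],
    "crn": [11, 12, 11],
    "course_id": [1, 1, 2],
})
mapping = (listings[["season_code", "course_id", "crn"]]
           .groupby("season_code")
           .apply(lambda x: x.set_index("crn")["course_id"].to_dict())
           .to_dict())
assert mapping == {"202001": {11: 1, 12: 1}, "202101": {11: 2}}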
class Apply: def setup(self): self.df = DataFrame(np.random.randn(1000, 100)) self.s = Series(np.arange(1028.0)) self.df2 = DataFrame({i: self.s for i in range(1028)}) self.df3 = DataFrame(np.random.randn(1000, 3), columns=list("ABC")) def time_apply_user_func(self): self.df2.apply(lambda x: np.corrcoef(x, self.s)[(0, 1)]) def time_apply_axis_1(self): self.df.apply(lambda x: x + 1, axis=1) def time_apply_lambda_mean(self): self.df.apply(lambda x: x.mean()) def time_apply_np_mean(self): self.df.apply(np.mean) def time_apply_pass_thru(self): self.df.apply(lambda x: x) def time_apply_ref_by_name(self): self.df3.apply(lambda x: x["A"] + x["B"], axis=1)
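# The axis=1 cases above are the slow path (each row is boxed into a
# Series); a hedged illustration of why the vectorized form is preferred:
import numpy as np
import pandas as pd

df3 = pd.DataFrame(np.random.randn(1000, 3), columns=list("ABC"))
by_apply = df3.apply(lambda x: x["A"] + x["B"], axis=1)
vectorized = df3["A"] + df3["B"]  # same values, no per-row Series creation
assert np.allclose(by_apply, vectorized)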
def resolve_cross_listings(merged_course_info: pd.DataFrame) -> pd.DataFrame: """ Resolve course cross-listings by computing unique course_ids. Parameters ---------- merged_course_info: Raw course information from JSON files. Returns ------- merged_course_info with 'temp_course_id' field added. """ # seasons must be sorted in ascending order # prioritize Yale College courses when deduplicating listings print("Sorting by season and if-undergrad") def classify_yc(row): if row["school"] == "YC": return True if row["school"] != row["school"]: # check number of numbers in course number # (some courses have letters in them) num_nums = len([x for x in row["number"] if x.isnumeric()]) # if the course number is in the 000s to 400s range it's undergrad if row["number"][0] in ["0", "1", "2", "3", "4"] and num_nums < 4: return True return False merged_course_info["is_yc"] = merged_course_info.apply(classify_yc, axis=1) merged_course_info = merged_course_info.sort_values( by=["season_code", "is_yc"], ascending=[True, False]) print("Aggregating cross-listings") merged_course_info["season_code"] = merged_course_info[ "season_code"].astype(int) merged_course_info["crn"] = merged_course_info["crn"].astype(int) merged_course_info["crns"] = merged_course_info["crns"].apply( lambda crns: [int(crn) for crn in crns]) # group CRNs by season for cross-listing deduplication crns_by_season = merged_course_info.groupby("season_code")[ # type: ignore "crns"].apply(list) # convert CRN groups to sets for resolution crns_by_season = crns_by_season.apply(lambda x: [frozenset(y) for y in x]) # resolve overlapping CRN sets crns_by_season = crns_by_season.apply(merge_overlapping) print("Mapping out cross-listings") # map CRN groups to temporary IDs within each season temp_course_ids_by_season = crns_by_season.apply( lambda x: invert_dict_of_lists(dict(enumerate(x)))) temp_course_ids_by_season = temp_course_ids_by_season.to_dict() # assign season-specific ID based on CRN group IDs merged_course_info["season_course_id"] = merged_course_info.apply( lambda row: temp_course_ids_by_season[row["season_code"]][row["crn"]], axis=1) # temporary string-based unique course identifier merged_course_info["temp_course_id"] = merged_course_info.apply( lambda x: f"{x['season_code']}_{x['season_course_id']}", axis=1) return merged_course_info
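# `merge_overlapping` and `invert_dict_of_lists` are imported from
# elsewhere in the project; hedged sketches consistent with how they are
# used above:
def merge_overlapping(sets):
    # union any CRN sets that share an element (simple fixed-point sweep)
    pending = [set(s) for s in sets]
    merged = []
    while pending:
        cur = pending.pop()
        changed = True
        while changed:
            changed = False
            for other in pending[:]:
                if cur & other:
                    cur |= other
                    pending.remove(other)
                    changed = True
        merged.append(cur)
    return merged

def invert_dict_of_lists(d):
    # {0: [11, 12], 1: [13]} -> {11: 0, 12: 0, 13: 1}
    return {member: key for key, members in d.items() for member in members}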
Chinese English Math name guan 132.0 65 30 XIAOGUAN zhang 190.0 85 98 XIAOZHANG zhao 186.0 92 96 XIAOZHAO ma 180.0 88 77 XIAOMA huang NaN 90 90 XIAOHUANG ''' ## Using a more complex function def plus(df, n, m): df['new1'] = (df['Chinese'] + df['English']) * m df['new2'] = (df['Chinese'] + df['English']) * n return df print('\n') df1 = data_frame2.apply(plus, axis=1, args=(2, 3)) print(df1) '''output Chinese English Math name new1 new2 guan 132.0 65 30 XIAOGUAN 591.0 394.0 zhang 190.0 85 98 XIAOZHANG 825.0 550.0 zhao 186.0 92 96 XIAOZHAO 834.0 556.0 ma 180.0 88 77 XIAOMA 804.0 536.0 huang NaN 90 90 XIAOHUANG NaN NaN '''
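# apply also forwards keyword arguments directly, so the positional
# args=(2, 3) call above can be written more readably as:
df1 = data_frame2.apply(plus, axis=1, n=2, m=3)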
class Scores(object): """ Parameters ---------- uri : str, optional modality : str, optional Returns ------- scores : `Scores` Examples -------- >>> s = Scores(uri='video', modality='speaker') >>> s[Segment(0,1), 's1', 'A'] = 0.1 >>> s[Segment(0,1), 's1', 'B'] = 0.2 >>> s[Segment(0,1), 's1', 'C'] = 0.3 >>> s[Segment(0,1), 's2', 'A'] = 0.4 >>> s[Segment(0,1), 's2', 'B'] = 0.3 >>> s[Segment(0,1), 's2', 'C'] = 0.2 >>> s[Segment(2,3), 's1', 'A'] = 0.2 >>> s[Segment(2,3), 's1', 'B'] = 0.1 >>> s[Segment(2,3), 's1', 'C'] = 0.3 """ @classmethod def from_df(cls, df, uri=None, modality=None, aggfunc=np.mean): """ Parameters ---------- df : DataFrame Must contain the following columns: 'segment', 'track', 'label' and 'value' uri : str, optional Resource identifier modality : str, optional Modality aggfunc : func Value aggregation function in case of duplicate (segment, track, label) tuples Returns ------- """ dataframe = pivot_table(df, values=PYANNOTE_SCORE, index=[PYANNOTE_SEGMENT, PYANNOTE_TRACK], columns=PYANNOTE_LABEL, aggfunc=aggfunc) annotation = Annotation(uri=uri, modality=modality) for index, _ in dataframe.iterrows(): segment = Segment(*index[0]) track = index[1] annotation[segment, track] = '' labels = dataframe.columns return cls(uri=uri, modality=modality, annotation=annotation, labels=labels, values=dataframe.values) def __init__(self, uri=None, modality=None, annotation=None, labels=None, values=None, dtype=None): super(Scores, self).__init__() names = [PYANNOTE_SEGMENT + '_' + field for field in Segment._fields] + [PYANNOTE_TRACK] if annotation: annotation = annotation.copy() index = Index([s + (t, ) for s, t in annotation.itertracks()], name=names) else: annotation = Annotation(uri=uri, modality=modality) index = MultiIndex(levels=[list() for name in names], labels=[list() for name in names], names=names) self.annotation_ = annotation columns = None if labels is None else list(labels) data = None if values is None else np.array(values) dtype = np.float if values is None else values.dtype self.dataframe_ = DataFrame(data=data, dtype=dtype, index=index, columns=columns) self.hasChanged_ = True self.modality = modality self.uri = uri def copy(self): self._reindexIfNeeded() copied = self.__class__(uri=self.uri, modality=self.modality) copied.dataframe_ = self.dataframe_.copy() copied.annotation_ = self.annotation_.copy() copied.hasChanged_ = self.hasChanged_ return copied # del scores[segment] # del scores[segment, :] # del scores[segment, track] def __delitem__(self, key): if isinstance(key, Segment): segment = key self.dataframe_.drop(tuple(segment), axis=0, inplace=True) del self.annotation_[segment] self.hasChanged_ = True elif isinstance(key, tuple) and len(key) == 2: segment, track = key self.dataframe_.drop(tuple(segment) + (track, ), axis=0, inplace=True) del self.annotation_[segment, track] self.hasChanged_ = True else: raise KeyError('') # value = scores[segment, track, label] def __getitem__(self, key): if len(key) == 2: key = (key[0], '_', key[1]) segment, track, label = key return self.dataframe_.at[tuple(segment) + (track, ), label] # scores[segment, track, label] = value # scores[segment, label] ==== scores[segment, '_', label] def __setitem__(self, key, value): if len(key) == 2: key = (key[0], '_', key[1]) segment, track, label = key # do not add empty track if not segment: return self.dataframe_.at[tuple(segment) + (track, ), label] = value self.annotation_[segment, track] = label self.hasChanged_ = True def __len__(self): """Number of annotated segments""" return 
len(self.annotation_) def __nonzero__(self): return self.__bool__() def __bool__(self): """False if annotation is empty""" return True if self.annotation_ else False def __contains__(self, included): """Check if segments are annotated Parameters ---------- included : `Segment` or `Timeline` Returns ------- contains : bool True if every segment in `included` is annotated, False otherwise. """ return included in self.annotation_ def __iter__(self): """Iterate over sorted segments""" return iter(self.annotation_.get_timeline(copy=False)) def __reversed__(self): """Reverse iterate over sorted segments""" return reversed(self.annotation_.get_timeline(copy=False)) def itersegments(self): return iter(self) def tracks(self, segment): """Set of tracks for query segment Parameters ---------- segment : `Segment` Query segment Returns ------- tracks : set Set of tracks for query segment """ return self.annotation_.get_tracks(segment) def has_track(self, segment, track): """Check whether a given track exists Parameters ---------- segment : `Segment` Query segment track : Query track Returns ------- exists : bool True if track exists for segment """ return self.annotation_.has_track(segment, track) def get_track_by_name(self, track): """Get all tracks with given name Parameters ---------- track : any valid track name Requested track name Returns ------- tracks : list List of (segment, track) tuples """ return self.annotation_.get_track_by_name(track) def new_track(self, segment, candidate=None, prefix=None): """Track name generator Parameters ---------- segment : Segment prefix : str, optional candidate : any valid track name Returns ------- track : str New track name """ return self.annotation_.new_track(segment, candidate=candidate, prefix=prefix) def itertracks(self): """Iterate over annotation as (segment, track) tuple""" return self.annotation_.itertracks() def itervalues(self): """Iterate over scores as (segment, track, label, value) tuple""" # make sure segment/track pairs are sorted self._reindexIfNeeded() labels = self.labels() # yield one (segment, track, label, value) tuple per non-NaN score for index, columns in self.dataframe_.iterrows(): segment = Segment(*index[:-1]) track = index[-1] for label in labels: value = columns[label] if not np.isnan(value): yield segment, track, label, value def get_track_scores(self, segment, track): """Get all scores for a given track. Parameters ---------- segment : Segment track : hashable segment, track must be a valid track Returns ------- scores : dict {label: score} dictionary """ return dict(self.dataframe_.xs(tuple(segment) + (track, ))) def labels(self): """List of labels Returns ------- labels : list Sorted list of existing labels Remarks ------- Labels are sorted based on their string representation.
""" return sorted(self.dataframe_.columns, key=str) def _reindexIfNeeded(self): if not self.hasChanged_: return names = [PYANNOTE_SEGMENT + '_' + field for field in Segment._fields] + [PYANNOTE_TRACK] new_index = Index( [s + (t, ) for s, t in self.annotation_.itertracks()], name=names) self.dataframe_ = self.dataframe_.reindex(new_index) self.hasChanged_ = False return def retrack(self): """ """ self._reindexIfNeeded() retracked = self.copy() annotation = self.annotation_.retrack() retracked.annotation_ = annotation names = [PYANNOTE_SEGMENT + '_' + field for field in Segment._fields] + [PYANNOTE_TRACK] new_index = Index([s + (t, ) for s, t in annotation.itertracks()], name=names) retracked.dataframe_.index = new_index return retracked def apply(self, func, axis=0): applied = self.copy() applied.dataframe_ = self.dataframe_.apply(func, axis=axis) applied.hasChanged_ = True return applied def rank(self, ascending=False): """ Parameters ---------- ascending : boolean, default False False for ranks by high (0) to low (N-1) Returns ------- rank : `Scores` """ ranked = self.copy() ranked.dataframe_ = -1 + self.dataframe_.rank(axis=1, ascending=ascending) ranked.hasChanged_ = True return ranked def nbest(self, n, ascending=False): """ Parameters ---------- n : int Size of n-best list ascending : boolean, default False False for ranks by high (0) to low (N-1) Returns ------- nbest : `Scores` New scores where only n-best are kept. """ filtered = self.copy() ranked_ = -1 + self.dataframe_.rank(axis=1, ascending=ascending) filtered.dataframe_ = filtered.dataframe_.where(ranked_ < n, other=np.NaN) filtered.hasChanged_ = True return filtered def subset(self, labels, invert=False): """Scores subset Extract scores subset based on labels Parameters ---------- labels : set Set of labels invert : bool, optional If invert is True, extract all but requested `labels` Returns ------- subset : `Scores` Scores subset. """ self._reindexIfNeeded() if not isinstance(labels, set): raise TypeError('labels must be provided as a set of labels.') if invert: labels = set(self.labels()) - labels else: labels = labels & set(self.labels()) subset = Scores(uri=self.uri, modality=self.modality) subset.annotation_ = self.annotation_ subset.dataframe_ = self.dataframe_[list(labels)] return subset def to_annotation(self, threshold=-np.inf, posterior=False): """ Parameters ---------- threshold : float, optional Each track is annotated with the label with the highest score. Yet, if the latter is smaller than `threshold`, label is replaced with an `Unknown` instance. posterior : bool, optional If True, scores are posterior probabilities in open-set identification. If top model posterior is higher than unknown posterior, it is selected. Otherwise, label is replaced with an `Unknown` instance. """ if not self: return Annotation(uri=self.uri, modality=self.modality) best = self.nbest(1, ascending=False) large_enough = best.copy() if posterior: unknown_posterior = 1. 
- self.dataframe_.sum(axis=1) large_enough.dataframe_ = (((best.dataframe_.T > unknown_posterior) & (best.dataframe_.T > threshold)).T) else: large_enough.dataframe_ = ((best.dataframe_.T > threshold).T) large_enough.dataframe_.where(best.dataframe_.notnull(), inplace=True, other=np.NaN) annotation = Annotation(uri=self.uri, modality=self.modality) for segment, track, label, value in large_enough.itervalues(): label = label if value else Unknown() annotation[segment, track] = label return annotation def map(self, func): """Apply function to all values""" mapped = self.copy() mapped.dataframe_ = self.dataframe_.applymap(func) mapped.hasChanged_ = True return mapped def crop(self, focus, mode='strict'): """Crop on focus Parameters ---------- focus : `Segment` or `Timeline` mode : {'strict', 'loose', 'intersection'} In 'strict' mode, only segments fully included in focus coverage are kept. In 'loose' mode, any intersecting segment is kept unchanged. In 'intersection' mode, only intersecting segments are kept and replaced by their actual intersection with the focus. Returns ------- cropped : same type as caller Cropped version of the caller containing only tracks matching the provided focus and mode. Remarks ------- In 'intersection' mode, the best is done to keep the track names unchanged. However, in some cases where two original segments are cropped into the same resulting segments, conflicting track names are modified to make sure no track is lost. """ if isinstance(focus, Segment): return self.crop(Timeline([focus], uri=self.uri), mode=mode) self._reindexIfNeeded() cropped = self.copy() if mode in ['strict', 'loose']: new_annotation = self.annotation_.crop(focus, mode=mode) keep = [ new_annotation.has_track(segment, track) for segment, track in self.itertracks() ] cropped.dataframe_ = self.dataframe_[keep] cropped.annotation_ = new_annotation cropped.hasChanged_ = True return cropped elif mode in ['intersection']: raise NotImplementedError('') # # two original segments might be cropped into the same resulting # # segment -- therefore, we keep track of the mapping # intersection, mapping = timeline.crop(coverage, # mode=mode, mapping=True) # # # create new empty annotation # A = self.__class__(uri=self.uri, modality=self.modality) # # for cropped in intersection: # for original in mapping[cropped]: # for track in self.tracks(original): # # try to use original track name (candidate) # # if it already exists, create a brand new one # new_track = A.new_track(cropped, candidate=track) # # copy each value, column by column # for label in self.dataframe_.columns: # value = self.dataframe_.get_value((original, track), # label) # A.dataframe_ = A.dataframe_.set_value((cropped, new_track), # label, value) # # return A def __str__(self): """Human-friendly representation""" if self: self._reindexIfNeeded() return str(self.dataframe_) else: return "" def _repr_png_(self): from .notebook import repr_scores return repr_scores(self)
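# A minimal usage sketch of the Scores API above, mirroring the class
# docstring (segment boundaries and values are illustrative):
s = Scores(uri='video', modality='speaker')
s[Segment(0, 1), 's1', 'A'] = 0.1
s[Segment(0, 1), 's1', 'B'] = 0.9
best = s.nbest(1)                            # keep the top-scoring label per track
hypothesis = s.to_annotation(threshold=0.5)  # track s1 is labelled 'B' (0.9 > 0.5)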
def get_slope(X: pd.Series) -> float: # fit log(X) against an integer time index; the coefficient approximates an exponential growth rate per step (zeros are left untransformed to avoid log(0)) lm = LinearRegression() lm.fit(np.arange(X.shape[0]).reshape(-1, 1), X.apply(lambda x: math.log(x) if x != 0 else x)) return lm.coef_[0]
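# A hedged usage sketch: for an exponentially growing series, the fitted
# slope of log(y) per step recovers the growth rate.
import numpy as np
import pandas as pd

y = pd.Series([1.0, 2.0, 4.0, 8.0])  # doubles every step
assert np.isclose(get_slope(y), np.log(2))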
def test_unstack_nan_index(self): # GH7466 cast = lambda val: '{0:1}'.format('' if val != val else val) nan = np.nan def verify(df): mk_list = lambda a: list(a) if isinstance(a, tuple) else [a] rows, cols = df.notna().values.nonzero() for i, j in zip(rows, cols): left = sorted(df.iloc[i, j].split('.')) right = mk_list(df.index[i]) + mk_list(df.columns[j]) right = sorted(list(map(cast, right))) assert left == right df = DataFrame({ 'jim': ['a', 'b', nan, 'd'], 'joe': ['w', 'x', 'y', 'z'], 'jolie': ['a.w', 'b.x', ' .y', 'd.z'] }) left = df.set_index(['jim', 'joe']).unstack()['jolie'] right = df.set_index(['joe', 'jim']).unstack()['jolie'].T assert_frame_equal(left, right) for idx in itertools.permutations(df.columns[:2]): mi = df.set_index(list(idx)) for lev in range(2): udf = mi.unstack(level=lev) assert udf.notna().values.sum() == len(df) verify(udf['jolie']) df = DataFrame({ '1st': ['d'] * 3 + [nan] * 5 + ['a'] * 2 + ['c'] * 3 + ['e'] * 2 + ['b'] * 5, '2nd': ['y'] * 2 + ['w'] * 3 + [nan] * 3 + ['z'] * 4 + [nan] * 3 + ['x'] * 3 + [nan] * 2, '3rd': [ 67, 39, 53, 72, 57, 80, 31, 18, 11, 30, 59, 50, 62, 59, 76, 52, 14, 53, 60, 51 ] }) df['4th'], df['5th'] = \ df.apply(lambda r: '.'.join(map(cast, r)), axis=1), \ df.apply(lambda r: '.'.join(map(cast, r.iloc[::-1])), axis=1) for idx in itertools.permutations(['1st', '2nd', '3rd']): mi = df.set_index(list(idx)) for lev in range(3): udf = mi.unstack(level=lev) assert udf.notna().values.sum() == 2 * len(df) for col in ['4th', '5th']: verify(udf[col]) # GH7403 df = pd.DataFrame({ 'A': list('aaaabbbb'), 'B': range(8), 'C': range(8) }) df.iloc[3, 1] = np.NaN left = df.set_index(['A', 'B']).unstack(0) vals = [[3, 0, 1, 2, nan, nan, nan, nan], [nan, nan, nan, nan, 4, 5, 6, 7]] vals = list(map(list, zip(*vals))) idx = Index([nan, 0, 1, 2, 4, 5, 6, 7], name='B') cols = MultiIndex(levels=[['C'], ['a', 'b']], labels=[[0, 0], [0, 1]], names=[None, 'A']) right = DataFrame(vals, columns=cols, index=idx) assert_frame_equal(left, right) df = DataFrame({ 'A': list('aaaabbbb'), 'B': list(range(4)) * 2, 'C': range(8) }) df.iloc[2, 1] = np.NaN left = df.set_index(['A', 'B']).unstack(0) vals = [[2, nan], [0, 4], [1, 5], [nan, 6], [3, 7]] cols = MultiIndex(levels=[['C'], ['a', 'b']], labels=[[0, 0], [0, 1]], names=[None, 'A']) idx = Index([nan, 0, 1, 2, 3], name='B') right = DataFrame(vals, columns=cols, index=idx) assert_frame_equal(left, right) df = pd.DataFrame({ 'A': list('aaaabbbb'), 'B': list(range(4)) * 2, 'C': range(8) }) df.iloc[3, 1] = np.NaN left = df.set_index(['A', 'B']).unstack(0) vals = [[3, nan], [0, 4], [1, 5], [2, 6], [nan, 7]] cols = MultiIndex(levels=[['C'], ['a', 'b']], labels=[[0, 0], [0, 1]], names=[None, 'A']) idx = Index([nan, 0, 1, 2, 3], name='B') right = DataFrame(vals, columns=cols, index=idx) assert_frame_equal(left, right) # GH7401 df = pd.DataFrame({ 'A': list('aaaaabbbbb'), 'B': (date_range('2012-01-01', periods=5).tolist() * 2), 'C': np.arange(10) }) df.iloc[3, 1] = np.NaN left = df.set_index(['A', 'B']).unstack() vals = np.array([[3, 0, 1, 2, nan, 4], [nan, 5, 6, 7, 8, 9]]) idx = Index(['a', 'b'], name='A') cols = MultiIndex(levels=[['C'], date_range('2012-01-01', periods=5)], labels=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]], names=[None, 'B']) right = DataFrame(vals, columns=cols, index=idx) assert_frame_equal(left, right) # GH4862 vals = [['Hg', nan, nan, 680585148], ['U', 0.0, nan, 680585148], ['Pb', 7.07e-06, nan, 680585148], ['Sn', 2.3614e-05, 0.0133, 680607017], ['Ag', 0.0, 0.0133, 680607017], ['Hg', -0.00015, 0.0133, 
680607017]] df = DataFrame(vals, columns=['agent', 'change', 'dosage', 's_id'], index=[17263, 17264, 17265, 17266, 17267, 17268]) left = df.copy().set_index(['s_id', 'dosage', 'agent']).unstack() vals = [[nan, nan, 7.07e-06, nan, 0.0], [0.0, -0.00015, nan, 2.3614e-05, nan]] idx = MultiIndex(levels=[[680585148, 680607017], [0.0133]], labels=[[0, 1], [-1, 0]], names=['s_id', 'dosage']) cols = MultiIndex(levels=[['change'], ['Ag', 'Hg', 'Pb', 'Sn', 'U']], labels=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]], names=[None, 'agent']) right = DataFrame(vals, columns=cols, index=idx) assert_frame_equal(left, right) left = df.loc[17264:].copy().set_index(['s_id', 'dosage', 'agent']) assert_frame_equal(left.unstack(), right) # GH9497 - multiple unstack with nulls df = DataFrame({ '1st': [1, 2, 1, 2, 1, 2], '2nd': pd.date_range('2014-02-01', periods=6, freq='D'), 'jim': 100 + np.arange(6), 'joe': (np.random.randn(6) * 10).round(2) }) df['3rd'] = df['2nd'] - pd.Timestamp('2014-02-02') df.loc[1, '2nd'] = df.loc[3, '2nd'] = nan df.loc[1, '3rd'] = df.loc[4, '3rd'] = nan left = df.set_index(['1st', '2nd', '3rd']).unstack(['2nd', '3rd']) assert left.notna().values.sum() == 2 * len(df) for col in ['jim', 'joe']: for _, r in df.iterrows(): key = r['1st'], (col, r['2nd'], r['3rd']) assert r[col] == left.loc[key]
class TestMoments(unittest.TestCase): _nan_locs = np.arange(20, 40) _inf_locs = np.array([]) def setUp(self): arr = randn(N) arr[self._nan_locs] = np.NaN self.arr = arr self.rng = bdate_range(datetime(2009, 1, 1), periods=N) self.series = Series(arr.copy(), index=self.rng) self.frame = DataFrame(randn(N, K), index=self.rng, columns=np.arange(K)) def test_rolling_sum(self): self._check_moment_func(mom.rolling_sum, np.sum) def test_rolling_count(self): counter = lambda x: np.isfinite(x).astype(float).sum() self._check_moment_func(mom.rolling_count, counter, has_min_periods=False, preserve_nan=False) def test_rolling_mean(self): self._check_moment_func(mom.rolling_mean, np.mean) def test_rolling_median(self): self._check_moment_func(mom.rolling_median, np.median) def test_rolling_min(self): self._check_moment_func(mom.rolling_min, np.min) def test_rolling_max(self): self._check_moment_func(mom.rolling_max, np.max) def test_rolling_quantile(self): qs = [.1, .5, .9] def scoreatpercentile(a, per): values = np.sort(a, axis=0) idx = per / 1. * (values.shape[0] - 1) return values[int(idx)] for q in qs: def f(x, window, min_periods=None, freq=None): return mom.rolling_quantile(x, window, q, min_periods=min_periods, freq=freq) def alt(x): return scoreatpercentile(x, q) self._check_moment_func(f, alt) def test_rolling_apply(self): ser = Series([]) assert_series_equal(ser, mom.rolling_apply(ser, 10, lambda x: x.mean())) def roll_mean(x, window, min_periods=None, freq=None): return mom.rolling_apply(x, window, lambda x: x[np.isfinite(x)].mean(), min_periods=min_periods, freq=freq) self._check_moment_func(roll_mean, np.mean) def test_rolling_std(self): self._check_moment_func(mom.rolling_std, lambda x: np.std(x, ddof=1)) self._check_moment_func(functools.partial(mom.rolling_std, ddof=0), lambda x: np.std(x, ddof=0)) def test_rolling_var(self): self._check_moment_func(mom.rolling_var, lambda x: np.var(x, ddof=1)) self._check_moment_func(functools.partial(mom.rolling_var, ddof=0), lambda x: np.var(x, ddof=0)) def test_rolling_skew(self): try: from scipy.stats import skew except ImportError: raise nose.SkipTest('no scipy') self._check_moment_func(mom.rolling_skew, lambda x: skew(x, bias=False)) def test_rolling_kurt(self): try: from scipy.stats import kurtosis except ImportError: raise nose.SkipTest('no scipy') self._check_moment_func(mom.rolling_kurt, lambda x: kurtosis(x, bias=False)) def _check_moment_func(self, func, static_comp, window=50, has_min_periods=True, has_time_rule=True, preserve_nan=True): self._check_ndarray(func, static_comp, window=window, has_min_periods=has_min_periods, preserve_nan=preserve_nan) self._check_structures(func, static_comp, has_min_periods=has_min_periods, has_time_rule=has_time_rule) def _check_ndarray(self, func, static_comp, window=50, has_min_periods=True, preserve_nan=True): result = func(self.arr, window) assert_almost_equal(result[-1], static_comp(self.arr[-50:])) if preserve_nan: assert (np.isnan(result[self._nan_locs]).all()) # excluding NaNs correctly arr = randn(50) arr[:10] = np.NaN arr[-10:] = np.NaN if has_min_periods: result = func(arr, 50, min_periods=30) assert_almost_equal(result[-1], static_comp(arr[10:-10])) # min_periods is working correctly result = func(arr, 20, min_periods=15) self.assert_(np.isnan(result[23])) self.assert_(not np.isnan(result[24])) self.assert_(not np.isnan(result[-6])) self.assert_(np.isnan(result[-5])) # min_periods=0 result0 = func(arr, 20, min_periods=0) result1 = func(arr, 20, min_periods=1) assert_almost_equal(result0, 
result1) else: result = func(arr, 50) assert_almost_equal(result[-1], static_comp(arr[10:-10])) def _check_structures(self, func, static_comp, has_min_periods=True, has_time_rule=True): series_result = func(self.series, 50) self.assert_(isinstance(series_result, Series)) frame_result = func(self.frame, 50) self.assertEquals(type(frame_result), DataFrame) # check time_rule works if has_time_rule: win = 25 minp = 10 if has_min_periods: series_result = func(self.series[::2], win, min_periods=minp, freq='B') frame_result = func(self.frame[::2], win, min_periods=minp, freq='B') else: series_result = func(self.series[::2], win, freq='B') frame_result = func(self.frame[::2], win, freq='B') last_date = series_result.index[-1] prev_date = last_date - 24 * datetools.bday trunc_series = self.series[::2].truncate(prev_date, last_date) trunc_frame = self.frame[::2].truncate(prev_date, last_date) assert_almost_equal(series_result[-1], static_comp(trunc_series)) assert_almost_equal(frame_result.xs(last_date), trunc_frame.apply(static_comp)) def test_legacy_time_rule_arg(self): from StringIO import StringIO # suppress deprecation warnings sys.stderr = StringIO() rng = bdate_range('1/1/2000', periods=20) ts = Series(np.random.randn(20), index=rng) ts = ts.take(np.random.permutation(len(ts))[:12]).sort_index() try: result = mom.rolling_mean(ts, 1, min_periods=1, freq='B') expected = mom.rolling_mean(ts, 1, min_periods=1, time_rule='WEEKDAY') tm.assert_series_equal(result, expected) result = mom.ewma(ts, span=5, freq='B') expected = mom.ewma(ts, span=5, time_rule='WEEKDAY') tm.assert_series_equal(result, expected) finally: sys.stderr = sys.__stderr__ def test_ewma(self): self._check_ew(mom.ewma) def test_ewmvar(self): self._check_ew(mom.ewmvar) def test_ewmvol(self): self._check_ew(mom.ewmvol) def test_ewma_span_com_args(self): A = mom.ewma(self.arr, com=9.5) B = mom.ewma(self.arr, span=20) assert_almost_equal(A, B) self.assertRaises(Exception, mom.ewma, self.arr, com=9.5, span=20) self.assertRaises(Exception, mom.ewma, self.arr) def _check_ew(self, func): self._check_ew_ndarray(func) self._check_ew_structures(func) def _check_ew_ndarray(self, func, preserve_nan=False): result = func(self.arr, com=10) if preserve_nan: assert (np.isnan(result[self._nan_locs]).all()) # excluding NaNs correctly arr = randn(50) arr[:10] = np.NaN arr[-10:] = np.NaN # ??? 
check something # pass in ints result2 = func(np.arange(50), span=10) self.assert_(result2.dtype == np.float_) def _check_ew_structures(self, func): series_result = func(self.series, com=10) self.assert_(isinstance(series_result, Series)) frame_result = func(self.frame, com=10) self.assertEquals(type(frame_result), DataFrame) # binary moments def test_rolling_cov(self): A = self.series B = A + randn(len(A)) result = mom.rolling_cov(A, B, 50, min_periods=25) assert_almost_equal(result[-1], np.cov(A[-50:], B[-50:])[0, 1]) def test_rolling_corr(self): A = self.series B = A + randn(len(A)) result = mom.rolling_corr(A, B, 50, min_periods=25) assert_almost_equal(result[-1], np.corrcoef(A[-50:], B[-50:])[0, 1]) # test for correct bias correction a = tm.makeTimeSeries() b = tm.makeTimeSeries() a[:5] = np.nan b[:10] = np.nan result = mom.rolling_corr(a, b, len(a), min_periods=1) assert_almost_equal(result[-1], a.corr(b)) def test_rolling_corr_pairwise(self): panel = mom.rolling_corr_pairwise(self.frame, 10, min_periods=5) correl = panel.ix[:, 1, 5] exp = mom.rolling_corr(self.frame[1], self.frame[5], 10, min_periods=5) tm.assert_series_equal(correl, exp) def test_flex_binary_frame(self): def _check(method): series = self.frame[1] res = method(series, self.frame, 10) res2 = method(self.frame, series, 10) exp = self.frame.apply(lambda x: method(series, x, 10)) tm.assert_frame_equal(res, exp) tm.assert_frame_equal(res2, exp) frame2 = self.frame.copy() frame2.values[:] = np.random.randn(*frame2.shape) res3 = method(self.frame, frame2, 10) exp = DataFrame( dict((k, method(self.frame[k], frame2[k], 10)) for k in self.frame)) tm.assert_frame_equal(res3, exp) methods = [mom.rolling_corr, mom.rolling_cov] for meth in methods: _check(meth) def test_ewmcov(self): self._check_binary_ew(mom.ewmcov) def test_ewmcorr(self): self._check_binary_ew(mom.ewmcorr) def _check_binary_ew(self, func): A = Series(randn(50), index=np.arange(50)) B = A[2:] + randn(48) A[:10] = np.NaN B[-10:] = np.NaN result = func(A, B, 20, min_periods=5) self.assert_(np.isnan(result.values[:15]).all()) self.assert_(not np.isnan(result.values[15:]).any()) self.assertRaises(Exception, func, A, randn(50), 20, min_periods=5)
series3 = frame['d'] series3 frame frame.sub(series3, axis=0) # match on the index and broadcast across the columns, i.e. series3 is subtracted from every column # NumPy ufuncs (element-wise array methods) also work on pandas objects frame = DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon']) frame np.abs(frame) # DataFrame's apply method applies a function to the 1D array formed by each column or row (Series level) f = lambda x: x.max() - x.min() # anonymous function f returning the range frame.apply(f) # default axis=0: applied column-wise, the range of each column frame.apply(f, axis=1) # applied row-wise, the range of each row # many common array statistics (such as sum and mean) are implemented as DataFrame methods, so apply is not needed for them # besides a scalar, the function passed to apply can also return a Series of several values def f(x): return Series([x.min(), x.max()], index=['min', 'max']) frame.apply(f) # returns a Series of the min and max of each column # element-wise Python functions also work on a DataFrame, via applymap format = lambda x: '%.2f' % x # anonymous element-wise formatting function frame.applymap(format) # Series has the equivalent element-wise function, map frame['e'].map(format)
def brand_fill(df: pd.DataFrame): # fill brand global regex regex = re.compile("(%s)" % "|".join(map(re.escape, name_dict.keys()))) # _save('Data/brand_dict_name_dict_regex', [brand_dict, name_dict, regex]) return df.apply(lambda x: brand_check(x.values[0], x.values[1]), axis=1)
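# `name_dict` and `brand_check` live elsewhere in this module; a hedged
# sketch of what the apply call above implies (column order assumed to be
# title first, brand second; the dict entries are hypothetical):
name_dict = {"coca cola": "Coca-Cola", "pepsi": "Pepsi"}

def brand_check(title, brand):
    # keep an existing brand; otherwise try to recover one from the title
    if brand:
        return brand
    match = regex.search(str(title).lower())
    return name_dict[match.group(1)] if match else None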
def test_apply_multi_index(self): s = DataFrame([[1, 2], [3, 4], [5, 6]]) s.index = MultiIndex.from_arrays([['a', 'a', 'b'], ['c', 'd', 'd']]) s.columns = ['col1', 'col2'] res = s.apply(lambda x: Series({'min': min(x), 'max': max(x)}), 1) assert isinstance(res.index, MultiIndex)
def flows(futures, start=None, end=None, var=None, roll=None): position = futures.position market = futures.market # market1 = futures.p market = DataFrame( list(market.find({ 'date': { '$gte': start }, 'variety': var }))) position = DataFrame( list(position.find({ 'date': { '$gte': start }, 'variety': var }))).drop_duplicates(['date', 'variety', 'symbol', 'long_party_name'], 'last') # position = position[['date','varie']] # position = position[position['long_party_name'].notna()] # positions # all member brokers party_name = position[position['date'] == end] long_party_name = party_name['long_party_name'] short_party_name = party_name['short_party_name'] party_name = long_party_name.append( short_party_name).dropna().drop_duplicates() # sum long/short open interest and its changes long = position.groupby(['date', 'variety', 'long_party_name' ])[['long_openIntr', 'long_openIntr_chg']].sum() # print(long) short = position.groupby(['date', 'variety', 'short_party_name' ])[['short_openIntr', 'short_openIntr_chg']].sum() # # merge frames = [long, short] position = pd.concat(frames, axis=1, sort=True).fillna(0).reset_index() # rename columns position = position.rename(columns={ 'level_0': 'date', 'level_1': 'variety', 'level_2': 'BrokerID' }) # ## market data market = market.copy() # index close market['cv'] = market.apply(lambda x: x['close'] * x['open_interest'], axis=1) closes = market.groupby(['date', 'variety'])[['cv', 'open_interest']].sum() closes['close_index'] = closes['cv'] / closes['open_interest'] # # index open market['ov'] = market.apply(lambda x: x['open'] * x['open_interest'], axis=1) opens = market.groupby(['date', 'variety'])[['ov', 'open_interest']].sum() closes['open_index'] = opens['ov'] / opens['open_interest'] # price change closes['change_index'] = closes.apply( lambda x: x['close_index'] - x['open_index'], axis=1) closes = closes.reset_index() chg = closes[['date', 'variety', 'close_index', 'change_index']] # print(chg['change_index']) # print(merge) df = pd.DataFrame() for i in party_name: try: chg = chg.copy() # print(chg) chg['BrokerID'] = i position1 = position[position['BrokerID'] == i] # merge the two tables mem = pd.merge(chg, position1, on=['date', 'variety', 'BrokerID'], how='left').fillna(0) # mem = merge[merge['BrokerID'] == i] # print(mem) mem = mem.copy() mem['today_net'] = mem.apply( lambda x: x['long_openIntr'] - x['short_openIntr'], axis=1) mem['yesterday_net'] = mem.groupby(['variety', 'BrokerID' ])['today_net'].shift(1) mem['tomorrow_chg'] = mem.groupby(['variety', 'BrokerID' ])['change_index'].shift(-1) mem['net_chg'] = mem.apply( lambda x: x['today_net'] - x['yesterday_net'], axis=1) # mem['count'] = mem['net_chg'].count() # mem = mem.rename(columns={'long_open_interest': 'long_openIntr', 'long_open_interest_chg': 'long_openIntr_chg', 'short_open_interest': 'short_openIntr','short_open_interest_chg': 'short_openIntr_chg'}) # mem['change'] = mem.groupby(['variety', 'BrokerID'])['close_index'].shift(1) mem['change'] = mem['close_index'] - mem['close_index'].shift(1) # rolling-window correlations # mem['corr'] = mem['net_chg'].rolling(window=240).corr(mem['change_index']) # mem['corr2'] = mem['net_chg'].rolling(window=240).corr(mem['tomorrow_chg']).shift(1) # mem['corr3'] = mem['today_net'].rolling(window=240).corr(mem['change']) # mem['lot'] = 0 # mem = mem.copy() mem['lot'] = mem.apply(lambda x: 0 if x['today_net'] == 0 else 1 if x['today_net'] > 0 else -1, axis=1) mem['lot'] = mem['lot'].shift(1).fillna(0) mem['pnl'] = mem['change'] * mem['lot'] # mem['fee']=0 # mem['fee'][mem['lot'] != mem['lot'].shift(1)] = mem['close_index'] * 2*1 mem['netpnl'] = mem['pnl'] mem['cumpnl'] =
mem['netpnl'].rolling(roll).sum() # mem['date'] = pd.to_datetime(mem['date']) # # plotting # mem = mem.set_index('date') # with pd.plotting.plot_params.use('x_compat', True): # method 1 # mem[['cumpnl']].plot(color='r',title=mem[u'BrokerID'][0]+" "+var+' '+end) # mem['today_net'].plot(secondary_y=['today_net']) # plt.ylabel('net position') # plt.rcParams['font.sans-serif'] = ['SimHei'] # so Chinese labels display correctly # plt.rcParams['axes.unicode_minus'] = False # so the minus sign displays correctly # plt.show() # plt.plot(mem['cumpnl']) # print(mem) # flows = mem[mem['cumpnl'] > 0] # flows.sort_values('cumpnl', inplace=False) # print(flows) # # flows = flows[['date', 'variety', 'BrokerID', 'corr', 'corr2', 'today_net', 'net_chg', 'corr3', # 'cumpnl']].sort_values('cumpnl', # inplace=False) # [['date','variety','BrokerID','corr','corr2','cumpnl']] # flows = flows.rename(columns={'today_net': 'net position', 'cumpnl': 'cumulative PnL points', 'net_chg': 'net position change', 'corr3': 'correlation'}) # print(flows[['variety','BrokerID','net position','net position change','cumulative PnL points']]) # print(flows) # print(flows.sort_values('cumulative PnL points')) # mem=mem.groupby() # print(mem) # print(flows['net position'].sum()) # mem = mem[-1:] print(mem) df1 = pd.DataFrame(mem) df = df.append(df1) # print(df.tail(20)) except Exception: continue return df
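# The signal logic buried in the loop above, distilled as a hedged
# sketch: yesterday's net-position sign sets today's exposure, and PnL is
# that exposure times the day's index change.
import numpy as np
import pandas as pd

net = pd.Series([10.0, -5.0, 0.0, 8.0])       # broker net position by day
close = pd.Series([100.0, 102.0, 101.0, 103.0])
lot = np.sign(net).shift(1).fillna(0)         # lag one day, as with `lot` above
pnl = close.diff() * lot
cumpnl = pnl.rolling(2).sum()                 # rolling sum, as with `roll` above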
def alpha_diversity(X: pd.DataFrame, metric="richness", taxonomy: pd.DataFrame = None, component_type="OTU", mode="infer", idx_taxonomy=[ 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species' ], name=None, base=2, obsv_type=None, **alpha_kws): """ X => pd.DataFrame of Otu counts with columns as Otus and rows as samples metric => a callable or a string {entropy, richness, gini, singletons} taxonomy => pd.DataFrame of taxonomy assignments for Otus """ # Alpha measures def _entropy(x, **alpha_kws): return stats.entropy(x, base=base, **alpha_kws) def _richness(x, **alpha_kws): return (x > 0).sum().astype(int) def _gini(x, **alpha_kws): return skbio.diversity.alpha.gini_index(x, **alpha_kws) def _singletons(x, **alpha_kws): return (x == 1).sum().astype(int) d_metric_fn = { "entropy": _entropy, "richness": _richness, "gini": _gini, "singletons": _singletons } # Supported diversity measures if hasattr(metric, "__call__"): func = metric else: supported_metrics = list(d_metric_fn.keys()) assert metric in supported_metrics, f"`{metric}` is not compatible. Only available alpha diversity measures are {supported_metrics}" func = d_metric_fn[metric] name = metric # Compute diversity if mode == "infer": mode = "batch" if taxonomy is not None else "singular" assert mode in { "singular", "batch" }, "Please specify either 'singular', 'batch', or 'infer' for the mode" if mode == "singular": Se_alpha = X.apply(lambda x: func(x, **alpha_kws), axis=1) Se_alpha.index.name = name return Se_alpha if mode == "batch": assert taxonomy is not None, "`taxonomy` cannot be `None` when `mode='batch'`" d_level_metric = OrderedDict() for level in idx_taxonomy: if level in taxonomy.columns: df_level = otu_to_level(X, taxonomy, level=level) d_level_metric[level] = alpha_diversity(df_level, metric=metric, taxonomy=None, mode="singular", base=base, **alpha_kws) else: print( f"Skipping taxonomy level `{level}` because it is not in the taxonomy dataframe", file=sys.stderr) d_level_metric[component_type] = alpha_diversity(X, metric=metric, taxonomy=None, mode="singular", base=base, **alpha_kws) df_level_metric = pd.DataFrame(d_level_metric) df_level_metric.index.name = f"id_{obsv_type}" return df_level_metric
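# A minimal usage sketch on a toy OTU table ("richness" counts the
# nonzero components per sample; taxonomy is omitted, so mode falls back
# to 'singular'):
import pandas as pd

X = pd.DataFrame([[0, 3, 1],
                  [2, 0, 0]],
                 index=["sample_1", "sample_2"],
                 columns=["otu_a", "otu_b", "otu_c"])
richness = alpha_diversity(X, metric="richness")  # sample_1 -> 2, sample_2 -> 1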
def DistanceCalculation(df: DataFrame): # compute the distance row by row (the column no longer needs to be pre-initialised, since apply assigns every row) df['Miles'] = df.apply(lambda row: EstimatedDistance(row), axis=1) return df
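# `EstimatedDistance` is defined elsewhere; a hedged haversine-style
# stand-in, assuming each row carries start/end coordinates in degrees
# under hypothetical column names:
import math

def EstimatedDistance(row):
    lat1, lon1 = math.radians(row['start_lat']), math.radians(row['start_lon'])
    lat2, lon2 = math.radians(row['end_lat']), math.radians(row['end_lon'])
    a = (math.sin((lat2 - lat1) / 2) ** 2
         + math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2)
    return 3958.8 * 2 * math.asin(math.sqrt(a))  # Earth radius in miles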
import numpy as np from pandas import Series from pandas import DataFrame # - ufunc -> element wise array method work fine with pandas objects frame = DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon']) frame np.abs(frame) # Applying a function on 1D arrays to each column or row f = lambda x: x.max() - x.min() frame.apply(f) frame.apply(f, axis=1) # sum and mean can be applied directly (no need for apply) def f(x): return Series([x.min(), x.max()], index=['min', 'max']) frame.apply(f) # Element-wise python functions can be used too format = lambda x: '%.2f' % x # applymap, not map, because Series.map -> apply function element-wise frame.applymap(format)
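# The range statistic computed with apply above can also be written
# vectorized (a sketch; results match the lambda version of f):
frame.max() - frame.min()                # column-wise ranges
frame.max(axis=1) - frame.min(axis=1)    # row-wise ranges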