def test_dataframe(self, orient, numpy):
    if orient == "records" and numpy:
        pytest.skip("Not idiomatic pandas")

    df = DataFrame([[1, 2, 3], [4, 5, 6]], index=["a", "b"],
                   columns=["x", "y", "z"])
    encode_kwargs = {} if orient is None else dict(orient=orient)
    decode_kwargs = {} if numpy is None else dict(numpy=numpy)

    output = ujson.decode(ujson.encode(df, **encode_kwargs), **decode_kwargs)

    # Ensure proper DataFrame initialization.
    if orient == "split":
        dec = _clean_dict(output)
        output = DataFrame(**dec)
    else:
        output = DataFrame(output)

    # Corrections to enable DataFrame comparison.
    if orient == "values":
        df.columns = [0, 1, 2]
        df.index = [0, 1]
    elif orient == "records":
        df.index = [0, 1]
    elif orient == "index":
        df = df.transpose()

    tm.assert_frame_equal(output, df, check_dtype=False)
def testDataFrame(self):
    df = DataFrame([[1, 2, 3], [4, 5, 6]], index=["a", "b"],
                   columns=["x", "y", "z"])

    # column indexed
    outp = DataFrame(ujson.decode(ujson.encode(df)))
    self.assertTrue((df == outp).values.all())
    assert_array_equal(df.columns, outp.columns)
    assert_array_equal(df.index, outp.index)

    outp = DataFrame(**ujson.decode(ujson.encode(df, orient="split")))
    self.assertTrue((df == outp).values.all())
    assert_array_equal(df.columns, outp.columns)
    assert_array_equal(df.index, outp.index)

    outp = DataFrame(ujson.decode(ujson.encode(df, orient="records")))
    outp.index = df.index
    self.assertTrue((df == outp).values.all())
    assert_array_equal(df.columns, outp.columns)

    outp = DataFrame(ujson.decode(ujson.encode(df, orient="values")))
    outp.index = df.index
    self.assertTrue((df.values == outp.values).all())

    outp = DataFrame(ujson.decode(ujson.encode(df, orient="index")))
    self.assertTrue((df.transpose() == outp).values.all())
    assert_array_equal(df.transpose().columns, outp.columns)
    assert_array_equal(df.transpose().index, outp.index)
def svd_agg(m_rna, mi_rna, targets_matrix, c=1):
    if settings.CELERY_DEBUG:
        import sys
        sys.path.append('/Migration/skola/phd/projects/miXGENE/mixgene_project/wrappers/pycharm-debug.egg')
        import pydevd
        pydevd.settrace('localhost', port=6901, stdoutToServer=True, stderrToServer=True)
    #
    mRNA_data = m_rna.apply(lambda x: 1.0*x/max(x), axis=0)
    miRNA_data = mi_rna.apply(lambda x: 1-1.0*x/max(x), axis=0)
    #
    aggregate_data = mRNA_data
    #
    common_mRNAs = Index(set(mRNA_data.columns) & set(targets_matrix.columns))
    common_miRNAs = Index(set(miRNA_data.columns) & set(targets_matrix.index))
    #
    for mRNA in common_mRNAs:
        #
        mRNA = Index([mRNA])
        #
        targetting_miRNAs = targets_matrix.ix[targets_matrix[mRNA[0]] == 1, mRNA].index
        #
        selected_miRNA = miRNA_data.ix[:, targetting_miRNAs].T
        #
        if len(selected_miRNA.index) > 1:
            first_comp = DataFrame(np.linalg.svd(selected_miRNA)[2]).ix[0, :]
            first_comp.index = selected_miRNA.columns
        else:
            continue
        new_rep = DataFrame(np.linalg.svd(DataFrame([aggregate_data.ix[:, mRNA[0]],
                                                     first_comp]))[2]).ix[0, :]
        new_rep.index = aggregate_data.index
        aggregate_data.ix[:, mRNA[0]] = new_rep
    return aggregate_data
def svd_agg_train(m_rna, mi_rna, targets_matrix, hide_columns=Index([])):
    #
    sample_indexes = m_rna.index - hide_columns
    mRNA_data = m_rna.apply(lambda x: 1.0*x/max(x), axis=0).ix[sample_indexes, :]
    miRNA_data = mi_rna.apply(lambda x: 1-1.0*x/max(x), axis=0).ix[sample_indexes, :]
    #
    aggregate_data = mRNA_data
    #
    common_mRNAs = Index(set(mRNA_data.columns) & set(targets_matrix.columns))
    common_miRNAs = Index(set(miRNA_data.columns) & set(targets_matrix.index))
    #
    for mRNA in common_mRNAs:
        #
        mRNA = Index([mRNA])
        #
        targetting_miRNAs = targets_matrix.ix[targets_matrix[mRNA[0]] == 1, mRNA].index
        #
        selected_miRNA = miRNA_data.ix[:, targetting_miRNAs]
        #
        if len(selected_miRNA.columns) > 1:
            first_comp = DataFrame(np.linalg.svd(selected_miRNA)[2]).ix[0, :]
            first_comp.index = selected_miRNA.index
            new_rep = DataFrame(np.linalg.svd(DataFrame([aggregate_data.ix[:, mRNA[0]],
                                                         first_comp]).transpose())[2]).ix[0, :]
            new_rep.index = aggregate_data.index
            aggregate_data.ix[:, mRNA[0]] = new_rep
    return aggregate_data
def testDataFrame(self):
    df = DataFrame([[1, 2, 3], [4, 5, 6]], index=['a', 'b'],
                   columns=['x', 'y', 'z'])

    # column indexed
    outp = DataFrame(ujson.decode(ujson.encode(df)))
    self.assertTrue((df == outp).values.all())
    assert_array_equal(df.columns, outp.columns)
    assert_array_equal(df.index, outp.index)

    dec = _clean_dict(ujson.decode(ujson.encode(df, orient="split")))
    outp = DataFrame(**dec)
    self.assertTrue((df == outp).values.all())
    assert_array_equal(df.columns, outp.columns)
    assert_array_equal(df.index, outp.index)

    outp = DataFrame(ujson.decode(ujson.encode(df, orient="records")))
    outp.index = df.index
    self.assertTrue((df == outp).values.all())
    assert_array_equal(df.columns, outp.columns)

    outp = DataFrame(ujson.decode(ujson.encode(df, orient="values")))
    outp.index = df.index
    self.assertTrue((df.values == outp.values).all())

    outp = DataFrame(ujson.decode(ujson.encode(df, orient="index")))
    self.assertTrue((df.transpose() == outp).values.all())
    assert_array_equal(df.transpose().columns, outp.columns)
    assert_array_equal(df.transpose().index, outp.index)
def set2df(sets, column_names, index=None, sort=True):
    df = DataFrame(list(sets), columns=column_names, index=index)
    if sort:
        df = df.sort(column_names)
        if index:
            df.index = index
        else:
            df.index = range(len(df))
    return df
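# A minimal usage sketch for set2df above (not part of the original source). It assumes an
# older pandas release where DataFrame.sort(columns) is still available, since the function
# calls it; on modern pandas you would swap in sort_values. The example data is illustrative.
pairs = {('b', 2), ('c', 3), ('a', 1)}
letters_df = set2df(pairs, column_names=['letter', 'number'])
# After sorting, the index is reset to 0..n-1, so the frame reads:
#   letter  number
# 0      a       1
# 1      b       2
# 2      c       3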
def test_sort_datetimelike():
    # GH10505

    # use same data as test_groupby_sort_categorical, which category is
    # corresponding to datetime.month
    df = DataFrame({'dt': [datetime(2011, 7, 1), datetime(2011, 7, 1),
                           datetime(2011, 2, 1), datetime(2011, 5, 1),
                           datetime(2011, 2, 1), datetime(2011, 1, 1),
                           datetime(2011, 5, 1)],
                    'foo': [10, 8, 5, 6, 4, 1, 7],
                    'bar': [10, 20, 30, 40, 50, 60, 70]},
                   columns=['dt', 'foo', 'bar'])

    # ordered=True
    df['dt'] = Categorical(df['dt'], ordered=True)
    index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
             datetime(2011, 5, 1), datetime(2011, 7, 1)]
    result_sort = DataFrame(
        [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
    result_sort.index = CategoricalIndex(index, name='dt', ordered=True)

    index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
             datetime(2011, 5, 1), datetime(2011, 1, 1)]
    result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
                              columns=['foo', 'bar'])
    result_nosort.index = CategoricalIndex(index, categories=index,
                                           name='dt', ordered=True)

    col = 'dt'
    assert_frame_equal(
        result_sort, df.groupby(col, sort=True, observed=False).first())

    # when categories is ordered, group is ordered by category's order
    assert_frame_equal(
        result_sort, df.groupby(col, sort=False, observed=False).first())

    # ordered = False
    df['dt'] = Categorical(df['dt'], ordered=False)
    index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
             datetime(2011, 5, 1), datetime(2011, 7, 1)]
    result_sort = DataFrame(
        [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
    result_sort.index = CategoricalIndex(index, name='dt')

    index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
             datetime(2011, 5, 1), datetime(2011, 1, 1)]
    result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
                              columns=['foo', 'bar'])
    result_nosort.index = CategoricalIndex(index, categories=index,
                                           name='dt')

    col = 'dt'
    assert_frame_equal(
        result_sort, df.groupby(col, sort=True, observed=False).first())
    assert_frame_equal(
        result_nosort, df.groupby(col, sort=False, observed=False).first())
def test_grouper_index_types(self):
    # related GH5375
    # groupby misbehaving when using a Floatlike index
    df = DataFrame(np.arange(10).reshape(5, 2), columns=list('AB'))
    for index in [tm.makeFloatIndex, tm.makeStringIndex,
                  tm.makeUnicodeIndex, tm.makeIntIndex, tm.makeDateIndex,
                  tm.makePeriodIndex]:

        df.index = index(len(df))
        df.groupby(list('abcde')).apply(lambda x: x)

        df.index = list(reversed(df.index.tolist()))
        df.groupby(list('abcde')).apply(lambda x: x)
def deserialize(self, item, force_bytes_to_unicode=False):
    index = self._index_from_records(item)
    column_fields = [x for x in item.dtype.names
                     if x not in item.dtype.metadata['index']]
    multi_column = item.dtype.metadata.get('multi_column')
    if len(item) == 0:
        rdata = item[column_fields] if len(column_fields) > 0 else None
        if multi_column is not None:
            columns = MultiIndex.from_arrays(multi_column["values"],
                                             names=multi_column["names"])
            return DataFrame(rdata, index=index, columns=columns)
        else:
            return DataFrame(rdata, index=index)

    columns = item.dtype.metadata['columns']
    df = DataFrame(data=item[column_fields], index=index, columns=columns)

    if multi_column is not None:
        df.columns = MultiIndex.from_arrays(multi_column["values"],
                                            names=multi_column["names"])

    if force_bytes_to_unicode:
        # This is needed because 'str' data written in py2 is read back in py3 as 'bytes',
        # which breaks the workflow of people migrating to py3.
        # https://github.com/manahl/arctic/issues/598
        # This should not be used for a normal flow; instead, write unicode strings
        # if you want to work with str in py3.
        for c in df.select_dtypes(object):
            # The conversion does not use astype (as done for the index) because pandas has a
            # bug where it tries to convert the data columns to a unicode string, and the object
            # in this case would be bytes, e.g. b'abc', which is converted to u"b'abc'", i.e. it
            # includes the b character as well! This generally happens when there is a str
            # conversion without specifying the encoding, e.g. str(b'abc') -> "b'abc'"; the fix
            # is to tell it the encoding to use, i.e. str(b'abc', 'utf-8') -> "abc".
            if type(df[c].iloc[0]) == bytes:
                df[c] = df[c].str.decode('utf-8')

        if isinstance(df.index, MultiIndex):
            unicode_indexes = []
            # MultiIndex requires a conversion at each level.
            for level in range(len(df.index.levels)):
                _index = df.index.get_level_values(level)
                if isinstance(_index[0], bytes):
                    _index = _index.astype('unicode')
                unicode_indexes.append(_index)
            df.index = unicode_indexes
        else:
            if type(df.index[0]) == bytes:
                df.index = df.index.astype('unicode')

        if type(df.columns[0]) == bytes:
            df.columns = df.columns.astype('unicode')

    return df
def test_unstack_fill_frame(self):

    # From a dataframe
    rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
    df = DataFrame(rows, columns=list('AB'), dtype=np.int32)
    df.index = MultiIndex.from_tuples(
        [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

    result = df.unstack(fill_value=-1)

    rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
    expected = DataFrame(rows, index=list('xyz'), dtype=np.int32)
    expected.columns = MultiIndex.from_tuples(
        [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
    assert_frame_equal(result, expected)

    # From a mixed type dataframe
    df['A'] = df['A'].astype(np.int16)
    df['B'] = df['B'].astype(np.float64)

    result = df.unstack(fill_value=-1)
    expected['A'] = expected['A'].astype(np.int16)
    expected['B'] = expected['B'].astype(np.float64)
    assert_frame_equal(result, expected)

    # From a dataframe with incorrect data type for fill_value
    result = df.unstack(fill_value=0.5)

    rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
    expected = DataFrame(rows, index=list('xyz'), dtype=np.float)
    expected.columns = MultiIndex.from_tuples(
        [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
    assert_frame_equal(result, expected)
def test_nunique(self):
    df = DataFrame({
        'A': list('abbacc'),
        'B': list('abxacc'),
        'C': list('abbacx'),
    })

    expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]})
    result = df.groupby('A', as_index=False).nunique()
    tm.assert_frame_equal(result, expected)

    # as_index
    expected.index = list('abc')
    expected.index.name = 'A'
    result = df.groupby('A').nunique()
    tm.assert_frame_equal(result, expected)

    # with na
    result = df.replace({'x': None}).groupby('A').nunique(dropna=False)
    tm.assert_frame_equal(result, expected)

    # dropna
    expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3},
                         index=list('abc'))
    expected.index.name = 'A'
    result = df.replace({'x': None}).groupby('A').nunique()
    tm.assert_frame_equal(result, expected)
def twitter_count(keys, d, strdb):
    # Mongo
    connection = pymongo.MongoClient(keys['db']['host'])
    dbtm = connection[keys['db']['name']]
    db = dbtm[strdb]

    # MongoDB Query - Mentions
    # The Day Of
    upper_bound_start_ts = float(calendar.timegm(d[-1].utctimetuple())*1000)
    upper_bound_end = d[-1] + timedelta(days=1)
    upper_bound_end_ts = float(calendar.timegm(upper_bound_end.utctimetuple())*1000)
    #upper_bound_end_ts = float(calendar.timegm(d[-1].utctimetuple())*1000)
    upper_bound_start = d[-1] - timedelta(days=1)
    upper_bound_start_ts = float(calendar.timegm(upper_bound_start.utctimetuple())*1000)
    #
    # Retrieve Tweets that are not authored by the user itself.
    if strdb in 'mentions':
        tr = db.aggregate([
            {'$match': {'timestamp': {'$gt': upper_bound_start_ts, '$lt': upper_bound_end_ts}}},
            {'$unwind': '$cdpid'},
            {'$group': {'_id': '$cdpid', strdb: {'$sum': 1}}}])
    # Tweets collection does not need unwind unlike mentions collection.
    else:
        tr = db.aggregate([
            {'$match': {'timestamp': {'$gt': upper_bound_start_ts, '$lt': upper_bound_end_ts}}},
            {'$group': {'_id': '$cdpid', strdb: {'$sum': 1}}}])

    tr = DataFrame(tr['result'])
    tr.index = tr._id
    tr = tr.drop('_id', axis=1)
    tr = tr.sort_index()
    #mts['Date'] = Period(d[-2],'D')
    print '%s for ' % (strdb), d[-1], ' processed'
    return (tr)
def test_merge_datetime_index(self, box):
    # see gh-19038
    df = DataFrame([1, 2, 3],
                   ["2016-01-01", "2017-01-01", "2018-01-01"],
                   columns=["a"])
    df.index = pd.to_datetime(df.index)
    on_vector = df.index.year

    if box is not None:
        on_vector = box(on_vector)

    expected = DataFrame(
        OrderedDict([
            ("a", [1, 2, 3]),
            ("key_1", [2016, 2017, 2018]),
        ])
    )

    result = df.merge(df, on=["a", on_vector], how="inner")
    tm.assert_frame_equal(result, expected)

    expected = DataFrame(
        OrderedDict([
            ("key_0", [2016, 2017, 2018]),
            ("a_x", [1, 2, 3]),
            ("a_y", [1, 2, 3]),
        ])
    )

    result = df.merge(df, on=[df.index.year], how="inner")
    tm.assert_frame_equal(result, expected)
def test_wls_panel(self): y = tm.makeTimeDataFrame() x = Panel({"x1": tm.makeTimeDataFrame(), "x2": tm.makeTimeDataFrame()}) y.ix[[1, 7], "A"] = np.nan y.ix[[6, 15], "B"] = np.nan y.ix[[3, 20], "C"] = np.nan y.ix[[5, 11], "D"] = np.nan stack_y = y.stack() stack_x = DataFrame(dict((k, v.stack()) for k, v in compat.iteritems(x))) weights = x.std("items") stack_weights = weights.stack() stack_y.index = stack_y.index._tuple_index stack_x.index = stack_x.index._tuple_index stack_weights.index = stack_weights.index._tuple_index with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = ols(y=y, x=x, weights=1 / weights) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): expected = ols(y=stack_y, x=stack_x, weights=1 / stack_weights) assert_almost_equal(result.beta, expected.beta) for attr in ["resid", "y_fitted"]: rvals = getattr(result, attr).stack().values evals = getattr(expected, attr).values assert_almost_equal(rvals, evals)
def predict(self, tree):
    """
    TODO Should take an array and predict every item.
    A score can be stored. It would follow the guidelines set by scikit-learn.
    """
    tree_rules = self.extract_rules(tree)
    df = DataFrame(columns=['label', 'prob'])
    gb = self.posteriori.groupby('label')
    for key, indexes in gb.groups.items():
        apriori_prob = self.apriori[self.apriori.label == key]['freq'].values[0]
        prob = apriori_prob
        group_df, missing_prob = self.apply_smoothing(self.posteriori.ix[indexes], tree_rules)
        for rule in tree_rules:
            prob_evidence = group_df[group_df.rule == rule]['freq']
            if len(prob_evidence) == 0:
                prob_evidence = missing_prob
            else:
                prob_evidence = prob_evidence.values[0]
            prob *= prob_evidence
        post = DataFrame({'label': [key], 'prob': [prob]})
        df = df.append(post)
    df.index = np.arange(df.index.size)
    df = df.sort(columns='prob', ascending=False)
    return df.ix[df['prob'].idxmax()]
def test_basic(self, sparse, dtype):
    s_list = list('abc')
    s_series = Series(s_list)
    s_series_index = Series(s_list, list('ABC'))

    expected = DataFrame({'a': [1, 0, 0],
                          'b': [0, 1, 0],
                          'c': [0, 0, 1]},
                         dtype=self.effective_dtype(dtype))
    result = get_dummies(s_list, sparse=sparse, dtype=dtype)
    if sparse:
        tm.assert_sp_frame_equal(result,
                                 expected.to_sparse(kind='integer',
                                                    fill_value=0))
    else:
        assert_frame_equal(result, expected)

    result = get_dummies(s_series, sparse=sparse, dtype=dtype)
    if sparse:
        expected = expected.to_sparse(kind='integer', fill_value=0)
    assert_frame_equal(result, expected)

    expected.index = list('ABC')
    result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
    if sparse:
        expected.to_sparse(kind='integer', fill_value=0)
    assert_frame_equal(result, expected)
def test_wls_panel(self):
    y = tm.makeTimeDataFrame()
    x = Panel({'x1': tm.makeTimeDataFrame(), 'x2': tm.makeTimeDataFrame()})

    y.iloc[[1, 7], y.columns.get_loc('A')] = np.nan
    y.iloc[[6, 15], y.columns.get_loc('B')] = np.nan
    y.iloc[[3, 20], y.columns.get_loc('C')] = np.nan
    y.iloc[[5, 11], y.columns.get_loc('D')] = np.nan

    stack_y = y.stack()
    stack_x = DataFrame(dict((k, v.stack())
                             for k, v in x.iteritems()))

    weights = x.std('items')
    stack_weights = weights.stack()

    stack_y.index = stack_y.index._tuple_index
    stack_x.index = stack_x.index._tuple_index
    stack_weights.index = stack_weights.index._tuple_index

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = ols(y=y, x=x, weights=1 / weights)
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        expected = ols(y=stack_y, x=stack_x, weights=1 / stack_weights)

    assert_almost_equal(result.beta, expected.beta)

    for attr in ['resid', 'y_fitted']:
        rvals = getattr(result, attr).stack().values
        evals = getattr(expected, attr).values
        assert_almost_equal(rvals, evals)
def test_wls_panel(self): y = tm.makeTimeDataFrame() x = Panel({"x1": tm.makeTimeDataFrame(), "x2": tm.makeTimeDataFrame()}) y.ix[[1, 7], "A"] = np.nan y.ix[[6, 15], "B"] = np.nan y.ix[[3, 20], "C"] = np.nan y.ix[[5, 11], "D"] = np.nan stack_y = y.stack() stack_x = DataFrame(dict((k, v.stack()) for k, v in x.iteritems())) weights = x.std("items") stack_weights = weights.stack() stack_y.index = stack_y.index.get_tuple_index() stack_x.index = stack_x.index.get_tuple_index() stack_weights.index = stack_weights.index.get_tuple_index() result = ols(y=y, x=x, weights=1 / weights) expected = ols(y=stack_y, x=stack_x, weights=1 / stack_weights) assert_almost_equal(result.beta, expected.beta) for attr in ["resid", "y_fitted"]: rvals = getattr(result, attr).stack().values evals = getattr(expected, attr).values assert_almost_equal(rvals, evals)
def bdib(self, ticker, fld_list, startDateTime, endDateTime, eventType='TRADE', interval=1):
    """
    Get one ticker (only one ticker available per call);
    eventType (TRADE, BID, ASK, ...etc); interval (in minutes);
    fld_list (only [open, high, low, close, volume, numEvents] available)
    return pandas dataframe with returned data
    """
    # Create and fill the request for the historical data
    request = self.refDataService.createRequest("IntradayBarRequest")
    request.set("security", ticker)
    request.set("eventType", eventType)
    request.set("interval", interval)  # bar interval in minutes
    request.set("startDateTime", startDateTime)
    request.set("endDateTime", endDateTime)

    print "Sending Request:", request
    # Send the request
    self.session.sendRequest(request)
    # defaultdict - later convert to pandas
    data = defaultdict(dict)
    # Process received events
    while(True):
        # We provide timeout to give the chance for Ctrl+C handling:
        ev = self.session.nextEvent(500)
        for msg in ev:
            barTickData = msg.getElement('barData').getElement('barTickData')
            for i in range(barTickData.numValues()):
                for j in range(len(fld_list)):
                    data[(fld_list[j])][barTickData.getValue(i).getElement(0).getValue()] = \
                        barTickData.getValue(i).getElement(fld_list[j]).getValue()
        if ev.eventType() == blpapi.Event.RESPONSE:
            # Response completely received, so we could exit
            break

    data = DataFrame(data)
    data.index = pd.to_datetime(data.index)
    return data
def test_fenci():
    dfs = []
    for i in range(0, 9):
        f = file('Data/ftags_{}.pkl'.format(i), 'rb')
        fdist = pickle.load(f)
        #fdist.plot(50)
        df = DataFrame(fdist.items(), columns=['关键词', '计数'])
        df = df.sort_index(by='计数', ascending=False)
        df.index = range(len(df))
        df_plt = df[:30]
        df_plt = df_plt[::-1]
        #df_plt['关键词'].apply(lambda x : x.encode('utf8'))
        print df_plt.head()
        df_plt.plot(kind='barh', x=df_plt['关键词'], title=classifies[i])
        #plt.show()
        filePath = 'Data/{}.png'.format(classifies[i])
        str_name_f = filePath.decode("utf8")
        plt.savefig(str_name_f, dpi=100)
        dfs.append((classifies[i], df))
        #print df[df[1] > 1]
        f.close()
    print 'end'

    with pd.ExcelWriter('Data/keys.xlsx') as writer:
        for key, df in dfs:
            print key
            df.to_excel(writer, sheet_name=key, index=False)
def test_wls_panel(self):
    y = tm.makeTimeDataFrame()
    x = Panel({'x1' : tm.makeTimeDataFrame(), 'x2' : tm.makeTimeDataFrame()})

    y.ix[[1, 7], 'A'] = np.nan
    y.ix[[6, 15], 'B'] = np.nan
    y.ix[[3, 20], 'C'] = np.nan
    y.ix[[5, 11], 'D'] = np.nan

    stack_y = y.stack()
    stack_x = DataFrame(dict((k, v.stack()) for k, v in x.iteritems()))

    weights = x.std('items')
    stack_weights = weights.stack()

    stack_y.index = stack_y.index.get_tuple_index()
    stack_x.index = stack_x.index.get_tuple_index()
    stack_weights.index = stack_weights.index.get_tuple_index()

    result = ols(y=y, x=x, weights=1/weights)
    expected = ols(y=stack_y, x=stack_x, weights=1/stack_weights)

    assert_almost_equal(result.beta, expected.beta)

    for attr in ['resid', 'y_fitted']:
        rvals = getattr(result, attr).stack().values
        evals = getattr(expected, attr).values
        assert_almost_equal(rvals, evals)
def data_frame(self):
    if self._processed_knockouts is None:
        self._process_knockouts()
    data_frame = DataFrame(self._processed_knockouts)
    data_frame.sort_values("size", inplace=True)
    data_frame.index = [i for i in range(len(data_frame))]
    return data_frame
def make_plot():
    # get list of the checked features
    features = request.form.getlist('feature')
    # capture the ticker input from the user
    ticker = request.form['ticker']

    # calculate one month time period from now
    now = datetime.now()
    end_date = now.strftime('%Y-%m-%d')
    start_date = (now - timedelta(days=30)).strftime('%Y-%m-%d')

    # fetch the appropriate dataset via API
    URL = 'https://www.quandl.com/api/v3/datasets/WIKI/'+ticker+'.json?start_date='+start_date+'&end_date='+end_date+'&order=asc&api_key=eFoXAcyvLhyuB3Rsvg6o'
    # URL = 'https://www.quandl.com/api/v3/datasets/WIKI/'+ticker+'.json?start_date=2015-08-01&end_date=2015-09-01&order=asc&api_key=eFoXAcyvLhyuB3Rsvg6o'
    r = requests.get(URL)

    # convert into a pandas dataframe
    request_df = DataFrame(r.json())
    df = DataFrame(request_df.ix['data', 'dataset'],
                   columns=request_df.ix['column_names', 'dataset'])
    df.columns = [x.lower() for x in df.columns]
    df = df.set_index(['date'])
    df.index = to_datetime(df.index)

    # create a Bokeh plot from the dataframe
    # output_file("stock.html", title="Stock prices changes for last month")
    p = figure(x_axis_type="datetime")
    if 'open' in features:
        p.line(df.index, df['open'], color='blue', legend='opening price')
    if 'high' in features:
        p.line(df.index, df['high'], color='red', legend='highest price')
    if 'close' in features:
        p.line(df.index, df['close'], color='green', legend='closing price')

    return p
def plotting():
    # get list of the checked features
    features = request.form.getlist('feature')
    # user's input
    ticker = request.form['ticker']

    # calculate the time one month before
    now = datetime.now()
    end_date = now.strftime('%Y-%m-%d')
    # calculate the time difference
    start_date = (now - timedelta(days=30)).strftime('%Y-%m-%d')

    # fetch the dataset
    URL = 'https://www.quandl.com/api/v3/datasets/WIKI/'+ticker+'.json?start_date='+start_date+'&end_date='+end_date+'&order=asc&api_key=WVEFZw8uyJzuvHE3VsQW'
    r = requests.get(URL)

    # pass to pandas dataframe
    raw_data = DataFrame(r.json())
    # clean up the data
    df = DataFrame(raw_data.ix['data', 'dataset'],
                   columns=raw_data.ix['column_names', 'dataset'])
    # set the column names with lower case
    df.columns = [x.lower() for x in df.columns]
    # set the index to the date column
    df = df.set_index(['date'])
    # convert the index to datetime
    df.index = to_datetime(df.index)

    # create the plot
    p = figure(x_axis_type="datetime")
    if 'open' in features:
        p.line(df.index, df['open'], color='blue', legend='opening price')
    if 'high' in features:
        p.line(df.index, df['high'], color='red', legend='highest price')
    if 'close' in features:
        p.line(df.index, df['close'], color='green', legend='closing price')

    return p
def output():
    # getting user set options from the index2.html page
    options = request.form.getlist('feature')
    stock = request.form['stock']
    stock = stock.upper()

    # requesting data from Quandl
    nw = datetime.now()
    start_date = (nw - timedelta(days=30)).strftime('%Y-%m-%d')
    end_date = nw.strftime('%Y-%m-%d')
    req_url = 'https://www.quandl.com/api/v3/datasets/WIKI/'+stock+'.json?start_date='+start_date+'&end_date='+end_date+'&order=asc&api_key=3bkydVzcH_PPsy5zzAPn'
    r = requests.get(req_url)

    # pandas in action
    request_df = DataFrame(r.json())
    df = DataFrame(request_df.ix['data', 'dataset'],
                   columns=request_df.ix['column_names', 'dataset'])
    df.columns = [x.lower() for x in df.columns]
    df = df.set_index(['date'])
    df.index = to_datetime(df.index)

    # create plot - PLAY AROUND WITH THIS TO MAKE IT GENUINE
    # output_file("output.html", title="Stock prices changes for last month")
    p = figure(x_axis_type="datetime")
    if 'open' in options:
        p.line(df.index, df['open'], color='black', legend='Opening price')
    if 'high' in options:
        p.line(df.index, df['high'], color='red', legend='Highest price')
    if 'close' in options:
        p.line(df.index, df['close'], color='blue', legend='Closing price')

    return p
def test_unstack_fill_frame(self): # From a dataframe rows = [[1, 2], [3, 4], [5, 6], [7, 8]] df = DataFrame(rows, columns=list("AB"), dtype=np.int32) df.index = MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]) result = df.unstack(fill_value=-1) rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]] expected = DataFrame(rows, index=list("xyz"), dtype=np.int32) expected.columns = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")]) assert_frame_equal(result, expected) # From a mixed type dataframe df["A"] = df["A"].astype(np.int16) df["B"] = df["B"].astype(np.float64) result = df.unstack(fill_value=-1) expected["A"] = expected["A"].astype(np.int16) expected["B"] = expected["B"].astype(np.float64) assert_frame_equal(result, expected) # From a dataframe with incorrect data type for fill_value result = df.unstack(fill_value=0.5) rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]] expected = DataFrame(rows, index=list("xyz"), dtype=np.float) expected.columns = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")]) assert_frame_equal(result, expected)
def get_regression_table(self):
    regression_table = DataFrame({"beta": self.coef, "std_X": self.std_X})
    regression_table.index = self.features
    regression_table['beta_normalized'] = regression_table.beta * regression_table.std_X
    regression_table['effect'] = np.fabs(regression_table['beta_normalized'])
    regression_table = regression_table.sort_index(by='effect', ascending=False)
    return regression_table
def make_plot(): types = request.form.getlist("type") ticker = request.form["ticker"] now = datetime.now() end_date = now.strftime("%Y-%m-%d") start_date = (now - timedelta(days=180)).strftime("%Y-%m-%d") # six - month timeframe URL = ( "https://www.quandl.com/api/v3/datasets/WIKI/" + ticker + ".json?start_date=" + start_date + "&end_date=" + end_date + "&order=asc&api_key=eFoXAcyvLhyuB3Rsvg6o" ) r = requests.get(URL) df_handle = DataFrame(r.json()) df = DataFrame(df_handle.ix["data", "dataset"], columns=df_handle.ix["column_names", "dataset"]) df.columns = [x.lower() for x in df.columns] df = df.set_index(["date"]) df.index = to_datetime(df.index) p = figure(x_axis_type="datetime") if "open" in types: p.line(df.index, df["open"], color="blue", legend="opening price") if "high" in types: p.line(df.index, df["high"], color="red", legend="highest price") if "close" in types: p.line(df.index, df["close"], color="green", legend="closing price") return p
def fetchQuotes(sym, start=FROM_DATE, end=CURRENT_DATE):
    his = None
    data = None
    daily = None
    try:
        # print start, end
        data = ystockquote.get_historical_prices(sym, start, end)
    except Exception:
        print "Please check the dates. Data might not be available. 404 returned"
        # 404 due to data yet not available
    if data:
        his = DataFrame(collections.OrderedDict(sorted(data.items()))).T
        his = his.convert_objects(convert_numeric=True)
        his.index = pd.to_datetime(his.index)
        his.insert(0, 'symbol', sym, allow_duplicates=True)
        # insert the date as dataframe too
        his.insert(1, 'date', his.index)
        # his.columns = getColumns('stock_quote_historical')
        # Removing as db dependency is removed
        his.columns = getColumnsNoSql('stock_quote_historical')
        daily = ystockquote.get_all(sym)
        # print daily
        # persist(his, daily, sym, end)
    return his, daily
def __init__(self, outcomes, texts, parameters_display, verbose=False):
    options = {"lowercase": True, "lemmatize": True, "remove-stopwords": True}
    super(DisplayTextModel, self).__init__(outcomes, texts, 'bag-of-words', options)

    data = DataFrame({"y": outcomes, "text": texts})

    # Storing whether the outcome is a dummy:
    if set(data.y) == set([0, 1]):
        self.is_dummy_outcome = True

    N = data.shape[0]
    self.number_of_observations = N
    data.index = [str(x) for x in range(N)]
    data['y_hat'] = self.pipe.predict(texts)

    ridge = self.pipe.named_steps['ridge_model']
    self.std_X = ridge.std_X
    self.parameters_display = parameters_display
    self.mean_outcome_in_groups = mean_outcome_in_groups(data.y, data.y_hat)
    self.percent_correct = share_correct(data.y, data.y_hat, verbose=verbose)
    self.outcome_summary = get_summary(outcomes)
    self.coef = ridge.coef_
    self.number_of_features = len(self.coef)

    features = self.pipe.named_steps['featurizer'].get_feature_names()
    self.features = [f.split("__")[1] for f in features]
) + '" target="_blank">' + targetpage.geturl() + '</a>' print(messages) else: result.loc[result['misspelling'] == rowdata[0], 'wiki'] = True #result.loc[result['misspelling'] == rowdata[0], 'wikiurl'] = browser.current_url result.loc[result['misspelling'] == rowdata[0], 'wikiurl'] = targetpage.geturl() messages = '[OK] ' + rowdata[ 0] + ': Found\n + Link: ' + targetpage.geturl() print(messages) output = output + '\n' + messages # Sorting result values result.sort_values(by=['duplication', 'wiki', 'misspelling', 'url'], ascending=[True, True, True, True], inplace=True) result.index = range(len(result)) # Exporting to csv file result.to_csv(outputname, header=True, index=True) #print (result.to_string()) #output = output + '\n' + result.to_string() f.write(output) f.close() else: f = open(logname, 'w') messages = '[ERR] Initialization faliure' print(messages) output = output + '\n' + messages f.write(output) f.close()
def classify_otus_experimental(
        representative_sequences: DNASequencesDirectoryFormat,
        tree: NewickFormat,
        reference_taxonomy: pd.DataFrame = None) -> pd.DataFrame:

    if reference_taxonomy is None:
        filename_default_taxonomy = os.path.join(_sepp_refs_path(),
                                                 'taxonomy_gg99.qza')
        reference_taxonomy = Artifact.load(filename_default_taxonomy).view(
            pd.DataFrame)

    # convert type of feature IDs to str (depending on pandas type inference
    # they might come as integers), to make sure they are of the same type as
    # in the tree.
    reference_taxonomy.index = map(str, reference_taxonomy.index)

    # load the insertion tree
    tree = skbio.TreeNode.read(str(tree))

    # ensure that all reference tips in the tree (those without the inserted
    # fragments) have a mapping in the user provided taxonomy table
    names_tips = {node.name for node in tree.tips()}
    names_fragments = {
        fragment.metadata['id']
        for fragment in representative_sequences.file.view(DNAIterator)
    }
    missing_features = (names_tips - names_fragments) -\
        set(reference_taxonomy.index)
    if len(missing_features) > 0:
        # QIIME2 users can run with --verbose and see stderr and stdout.
        # Thus, we here report more details about the mismatch:
        sys.stderr.write(
            ("The taxonomy artifact you provided does not contain lineage "
             "information for the following %i features:\n%s") %
            (len(missing_features), "\n".join(missing_features)))
        raise ValueError("Not all OTUs in the provided insertion tree have "
                         "mappings in the provided reference taxonomy.")

    taxonomy = []
    for fragment in representative_sequences.file.view(DNAIterator):
        # for every inserted fragment we now try to find the closest OTU tip
        # in the tree and an available mapping from the OTU-ID to a lineage
        # string:
        lineage_str = np.nan
        # first, let us check if the fragment has been inserted at all ...
        try:
            curr_node = tree.find(fragment.metadata['id'])
        except skbio.tree.MissingNodeError:
            continue
        # if yes, we start from the inserted node and traverse the tree as
        # little as possible towards the root and check at every level if one
        # or several OTU-tips are within the sub-tree.
        if curr_node is not None:
            foundOTUs = []
            # Traversal is stopped at a certain level, if one or more OTU-tips
            # have been found in the sub-tree OR ... (see break below)
            while len(foundOTUs) == 0:
                # SEPP insertion - especially for multiple very similar
                # sequences - can result in a rather complex topology change
                # if all those sequences are inserted into the same branch
                # leading to one OTU-tip. Thus, we cannot simply visit only
                # all siblings or descendants and rather need to traverse the
                # whole sub-tree. Average case should be well behaved,
                # thus I think it is ok.
                for node in curr_node.postorder():
                    if (node.name is not None) and \
                       (node.name in reference_taxonomy.index):
                        # if a suitable OTU-tip node is found AND this OTU-ID
                        # has a mapping in the user provided
                        # reference_taxonomy, we store the OTU-ID in the
                        # growing result list
                        foundOTUs.append(node.name)
                # ... if the whole tree has been traversed without success,
                # e.g. if the user provided reference_taxonomy did not contain
                # any matching OTU-IDs.
                if curr_node.is_root():
                    break
                # prepare next while iteration, by changing to the parent node
                curr_node = curr_node.parent

            if len(foundOTUs) > 0:
                # If the above method has identified exactly one OTU-tip, the
                # resulting lineage string would simply be the one provided by
                # the user reference_taxonomy. However, if the inserted
                # fragment cannot unambiguously be placed into the reference
                # tree, the above method will find multiple OTU-IDs, which
                # might have lineage strings in the user provided
                # reference_taxonomy that are similar up to a certain rank and
                # differ e.g. for genus and species.
                # Thus, we here find the longest common prefix of all lineage
                # strings. We don't operate per character, but per taxonomic
                # rank. Therefore, we first "convert" every lineage string
                # into a list of taxa, one per rank.
                split_lineages = []
                for otu in foundOTUs:
                    # find lineage string for OTU
                    lineage = reference_taxonomy.loc[otu, 'Taxon']
                    # necessary to split lineage apart to ensure that
                    # the longest common prefix operates on atomic ranks
                    # instead of characters
                    split_lineages.append(
                        list(map(str.strip, lineage.split(';'))))
                # find the longest common prefix rank-wise and concatenate to
                # one lineage string, separated by ;
                lineage_str = "; ".join(os.path.commonprefix(split_lineages))

        taxonomy.append({
            'Feature ID': fragment.metadata['id'],
            'Taxon': lineage_str
        })

    pd_taxonomy = pd.DataFrame(taxonomy)
    # test if dataframe is completely empty, or if no lineages could be found
    if (len(taxonomy) == 0) or \
       (pd_taxonomy['Taxon'].dropna().shape[0] == 0):
        raise ValueError(
            ("None of the representative-sequences can be found in the "
             "insertion tree. Please double check that both inputs match up, "
             "i.e. are results from the same 'sepp' run."))

    return pd_taxonomy.set_index('Feature ID')
def test_non_cython_api():
    # GH5610
    # non-cython calls should not include the grouper

    df = DataFrame(
        [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]], columns=["A", "B", "C"]
    )
    g = df.groupby("A")
    gni = df.groupby("A", as_index=False)

    # mad
    expected = DataFrame([[0], [np.nan]], columns=["B"], index=[1, 3])
    expected.index.name = "A"
    result = g.mad()
    tm.assert_frame_equal(result, expected)

    expected = DataFrame([[1, 0.0], [3, np.nan]], columns=["A", "B"], index=[0, 1])
    result = gni.mad()
    tm.assert_frame_equal(result, expected)

    # describe
    expected_index = Index([1, 3], name="A")
    expected_col = pd.MultiIndex(
        levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]],
        codes=[[0] * 8, list(range(8))],
    )
    expected = DataFrame(
        [
            [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
            [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
        ],
        index=expected_index,
        columns=expected_col,
    )
    result = g.describe()
    tm.assert_frame_equal(result, expected)

    expected = pd.concat(
        [
            df[df.A == 1].describe().unstack().to_frame().T,
            df[df.A == 3].describe().unstack().to_frame().T,
        ]
    )
    expected.index = Index([0, 1])
    result = gni.describe()
    tm.assert_frame_equal(result, expected)

    # any
    expected = DataFrame(
        [[True, True], [False, True]], columns=["B", "C"], index=[1, 3]
    )
    expected.index.name = "A"
    result = g.any()
    tm.assert_frame_equal(result, expected)

    # idxmax
    expected = DataFrame([[0.0], [np.nan]], columns=["B"], index=[1, 3])
    expected.index.name = "A"
    result = g.idxmax()
    tm.assert_frame_equal(result, expected)
def test_boolean_comparison(self):

    # GH 4576
    # boolean comparisons with a tuple/list give unexpected results
    df = DataFrame(np.arange(6).reshape((3, 2)))
    b = np.array([2, 2])
    b_r = np.atleast_2d([2, 2])
    b_c = b_r.T
    l = (2, 2, 2)
    tup = tuple(l)

    # gt
    expected = DataFrame([[False, False], [False, True], [True, True]])
    result = df > b
    assert_frame_equal(result, expected)

    result = df.values > b
    assert_numpy_array_equal(result, expected.values)

    result = df > l
    assert_frame_equal(result, expected)

    result = df > tup
    assert_frame_equal(result, expected)

    result = df > b_r
    assert_frame_equal(result, expected)

    result = df.values > b_r
    assert_numpy_array_equal(result, expected.values)

    pytest.raises(ValueError, df.__gt__, b_c)
    pytest.raises(ValueError, df.values.__gt__, b_c)

    # ==
    expected = DataFrame([[False, False], [True, False], [False, False]])
    result = df == b
    assert_frame_equal(result, expected)

    result = df == l
    assert_frame_equal(result, expected)

    result = df == tup
    assert_frame_equal(result, expected)

    result = df == b_r
    assert_frame_equal(result, expected)

    result = df.values == b_r
    assert_numpy_array_equal(result, expected.values)

    pytest.raises(ValueError, lambda: df == b_c)
    assert not np.array_equal(df.values, b_c)

    # with alignment
    df = DataFrame(np.arange(6).reshape((3, 2)),
                   columns=list('AB'), index=list('abc'))
    expected.index = df.index
    expected.columns = df.columns

    result = df == l
    assert_frame_equal(result, expected)

    result = df == tup
    assert_frame_equal(result, expected)
def test_unstack_fill(self):

    # GH #9746: fill_value keyword argument for Series
    # and DataFrame unstack

    # From a series
    data = Series([1, 2, 4, 5], dtype=np.int16)
    data.index = MultiIndex.from_tuples(
        [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

    result = data.unstack(fill_value=-1)
    expected = DataFrame({'a': [1, -1, 5], 'b': [2, 4, -1]},
                         index=['x', 'y', 'z'], dtype=np.int16)
    assert_frame_equal(result, expected)

    # From a series with incorrect data type for fill_value
    result = data.unstack(fill_value=0.5)
    expected = DataFrame({'a': [1, 0.5, 5], 'b': [2, 4, 0.5]},
                         index=['x', 'y', 'z'], dtype=np.float)
    assert_frame_equal(result, expected)

    # From a dataframe
    rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
    df = DataFrame(rows, columns=list('AB'), dtype=np.int32)
    df.index = MultiIndex.from_tuples(
        [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

    result = df.unstack(fill_value=-1)

    rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
    expected = DataFrame(rows, index=list('xyz'), dtype=np.int32)
    expected.columns = MultiIndex.from_tuples(
        [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
    assert_frame_equal(result, expected)

    # From a mixed type dataframe
    df['A'] = df['A'].astype(np.int16)
    df['B'] = df['B'].astype(np.float64)

    result = df.unstack(fill_value=-1)
    expected['A'] = expected['A'].astype(np.int16)
    expected['B'] = expected['B'].astype(np.float64)
    assert_frame_equal(result, expected)

    # From a dataframe with incorrect data type for fill_value
    result = df.unstack(fill_value=0.5)

    rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
    expected = DataFrame(rows, index=list('xyz'), dtype=np.float)
    expected.columns = MultiIndex.from_tuples(
        [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
    assert_frame_equal(result, expected)

    # Test unstacking with date times
    dv = pd.date_range('2012-01-01', periods=4).values
    data = Series(dv)
    data.index = MultiIndex.from_tuples(
        [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

    result = data.unstack()
    expected = DataFrame({'a': [dv[0], pd.NaT, dv[3]],
                          'b': [dv[1], dv[2], pd.NaT]},
                         index=['x', 'y', 'z'])
    assert_frame_equal(result, expected)

    result = data.unstack(fill_value=dv[0])
    expected = DataFrame({'a': [dv[0], dv[0], dv[3]],
                          'b': [dv[1], dv[2], dv[0]]},
                         index=['x', 'y', 'z'])
    assert_frame_equal(result, expected)

    # Test unstacking with time deltas
    td = [Timedelta(days=i) for i in range(4)]
    data = Series(td)
    data.index = MultiIndex.from_tuples(
        [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

    result = data.unstack()
    expected = DataFrame({'a': [td[0], pd.NaT, td[3]],
                          'b': [td[1], td[2], pd.NaT]},
                         index=['x', 'y', 'z'])
    assert_frame_equal(result, expected)

    result = data.unstack(fill_value=td[1])
    expected = DataFrame({'a': [td[0], td[1], td[3]],
                          'b': [td[1], td[2], td[1]]},
                         index=['x', 'y', 'z'])
    assert_frame_equal(result, expected)

    # Test unstacking with period
    periods = [Period('2012-01'), Period('2012-02'), Period('2012-03'),
               Period('2012-04')]
    data = Series(periods)
    data.index = MultiIndex.from_tuples(
        [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

    result = data.unstack()
    expected = DataFrame({'a': [periods[0], None, periods[3]],
                          'b': [periods[1], periods[2], None]},
                         index=['x', 'y', 'z'])
    assert_frame_equal(result, expected)

    result = data.unstack(fill_value=periods[1])
    expected = DataFrame({'a': [periods[0], periods[1], periods[3]],
                          'b': [periods[1], periods[2], periods[1]]},
                         index=['x', 'y', 'z'])
    assert_frame_equal(result, expected)

    # Test unstacking with categorical
    data = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
    data.index = pd.MultiIndex.from_tuples(
        [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

    # By default missing values will be NaN
    result = data.unstack()
    expected = DataFrame({'a': pd.Categorical(list('axa'),
                                              categories=list('abc')),
                          'b': pd.Categorical(list('bcx'),
                                              categories=list('abc'))},
                         index=list('xyz'))
    assert_frame_equal(result, expected)

    # Fill with non-category results in NaN entries similar to above
    result = data.unstack(fill_value='d')
    assert_frame_equal(result, expected)

    # Fill with category value replaces missing values as expected
    result = data.unstack(fill_value='c')
    expected = DataFrame({'a': pd.Categorical(list('aca'),
                                              categories=list('abc')),
                          'b': pd.Categorical(list('bcc'),
                                              categories=list('abc'))},
                         index=list('xyz'))
    assert_frame_equal(result, expected)
def anova2_lm_single(model, design_info, n_rows, test, pr_test, robust): """ Anova type II table for one fitted linear model. Parameters ---------- model : fitted linear model results instance A fitted linear model **kwargs** scale : float Estimate of variance, If None, will be estimated from the largest model. Default is None. test : str {"F", "Chisq", "Cp"} or None Test statistics to provide. Default is "F". Notes ----- Use of this function is discouraged. Use anova_lm instead. Type II Sum of Squares compares marginal contribution of terms. Thus, it is not particularly useful for models with significant interaction terms. """ terms_info = design_info.terms[:] # copy terms_info = _remove_intercept_patsy(terms_info) names = ['sum_sq', 'df', test, pr_test] table = DataFrame(np.zeros((n_rows, 4)), columns=names) cov = _get_covariance(model, None) robust_cov = _get_covariance(model, robust) col_order = [] index = [] for i, term in enumerate(terms_info): # grab all varaibles except interaction effects that contain term # need two hypotheses matrices L1 is most restrictive, ie., term==0 # L2 is everything except term==0 cols = design_info.slice(term) L1 = lrange(cols.start, cols.stop) L2 = [] term_set = set(term.factors) for t in terms_info: # for the term you have other_set = set(t.factors) if term_set.issubset(other_set) and not term_set == other_set: col = design_info.slice(t) # on a higher order term containing current `term` L1.extend(lrange(col.start, col.stop)) L2.extend(lrange(col.start, col.stop)) L1 = np.eye(model.model.exog.shape[1])[L1] L2 = np.eye(model.model.exog.shape[1])[L2] if L2.size: LVL = np.dot(np.dot(L1, robust_cov), L2.T) from scipy import linalg orth_compl, _ = linalg.qr(LVL) r = L1.shape[0] - L2.shape[0] # L1|2 # use the non-unique orthogonal completion since L12 is rank r L12 = np.dot(orth_compl[:, -r:].T, L1) else: L12 = L1 r = L1.shape[0] #from IPython.core.debugger import Pdb; Pdb().set_trace() if test == 'F': f = model.f_test(L12, cov_p=robust_cov) table.loc[table.index[i], test] = test_value = f.fvalue table.loc[table.index[i], pr_test] = f.pvalue # need to back out SSR from f_test table.loc[table.index[i], 'df'] = r col_order.append(cols.start) index.append(term.name()) table.index = Index(index + ['Residual']) table = table.iloc[np.argsort(col_order + [model.model.exog.shape[1] + 1])] # back out sum of squares from f_test ssr = table[test] * table['df'] * model.ssr / model.df_resid table['sum_sq'] = ssr # fill in residual table.loc['Residual', ['sum_sq', 'df', test, pr_test]] = (model.ssr, model.df_resid, np.nan, np.nan) return table
def test_margin_normalize(self): # GH 27500 df = DataFrame( { "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], "C": [ "small", "large", "large", "small", "small", "large", "small", "small", "large", ], "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], } ) # normalize on index result = crosstab( [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=0 ) expected = DataFrame( [[0.5, 0.5], [0.5, 0.5], [0.666667, 0.333333], [0, 1], [0.444444, 0.555556]] ) expected.index = MultiIndex( levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]], codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]], names=["A", "B"], ) expected.columns = Index(["large", "small"], dtype="object", name="C") tm.assert_frame_equal(result, expected) # normalize on columns result = crosstab( [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=1 ) expected = DataFrame( [ [0.25, 0.2, 0.222222], [0.25, 0.2, 0.222222], [0.5, 0.2, 0.333333], [0, 0.4, 0.222222], ] ) expected.columns = Index( ["large", "small", "Sub-Total"], dtype="object", name="C" ) expected.index = MultiIndex( levels=[["bar", "foo"], ["one", "two"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]], names=["A", "B"], ) tm.assert_frame_equal(result, expected) # normalize on both index and column result = crosstab( [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=True ) expected = DataFrame( [ [0.111111, 0.111111, 0.222222], [0.111111, 0.111111, 0.222222], [0.222222, 0.111111, 0.333333], [0.000000, 0.222222, 0.222222], [0.444444, 0.555555, 1], ] ) expected.columns = Index( ["large", "small", "Sub-Total"], dtype="object", name="C" ) expected.index = MultiIndex( levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]], codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]], names=["A", "B"], ) tm.assert_frame_equal(result, expected)
def textual_update_analysis( df: pd.DataFrame, extra_columns: List ) -> Tuple[Dict[str, str], List[Dict[str, Union[str, float]]]]: template_vars: Dict[str, Any] = {} summary = [] df = df.rename(columns={'date': 'ds', 'search_downloads': 'y'}) if 'asa' in df.columns: df['y'] = df['y'] - df['asa'] df.index = df['ds'] df = handle_outliers(df) if options.weekly: df = df.resample('W').apply(safe_mean) time_regressors = [] for _, row in df.iterrows(): if row['update'] == 'textual': additional_regressor = '{} (text)'.format( str(row['ds']).split(" ")[0]) df[additional_regressor] = [ other_row['y'] if other_row['ds'] >= row['ds'] else 0 for _, other_row in df.iterrows() ] time_regressors.append(additional_regressor) model = create_model('sherlock_textual', df, True, time_regressors + extra_columns) model.fit(10000 if options.sampler == 'metropolis' else 2000, method=Sampler.METROPOLIS if options.sampler == 'metropolis' else Sampler.NUTS, step_kwargs={'compute_convergence_checks': False} if options.sampler == 'metropolis' else {}) fig = plot_nowcast( model, [row['ds'] for _, row in df.iterrows() if row['update'] == 'textual']) plt.title('Downloads & Textual Updates') template_vars['textual_model'] = figure_to_base64(fig) summary.extend( summary_from_model_regressors(model, time_regressors + extra_columns)) extra_regressors_plots: List[Dict[str, str]] = [] for i in range(len(time_regressors), len(time_regressors) + len(extra_columns)): fig = plt.figure() plt.grid() plt.hist(model.trace['regressors_{}'.format(model.name)][:, i] * 100, bins=30, alpha=0.8, histtype='stepfilled') plt.axvline( np.median(model.trace['regressors_{}'.format(model.name)][:, i]) * 100, color="C3", lw=1, ls="dotted") plt.title("{} (in %)".format(extra_columns[i - len(time_regressors)])) extra_regressors_plots.append({ 'name': extra_columns[i - len(time_regressors)], 'img_data': figure_to_base64(fig) }) template_vars['extra_regressors_plots'] = extra_regressors_plots seasonality = {} for period, fig in plot_seasonality(model, alpha=options.alpha, plot_kwargs={}).items(): seasonality[int(period)] = figure_to_base64(fig) template_vars['textual_seasonality'] = seasonality return template_vars, summary
def pool_duplicate_subsets( data: pd.DataFrame, col_dupl_thresh: float = 0.2, subset_thresh: float = 0.2, min_col_pool: int = 3, exclude: Optional[List[str]] = None, return_details=False, ) -> pd.DataFrame: """ Checks for duplicates in subsets of columns and pools them. This can reduce \ the number of columns in the data without loosing much information. Suitable \ columns are combined to subsets and tested for duplicates. In case sufficient \ duplicates can be found, the respective columns are aggregated into a \ "pooled_var" column. Identical numbers in the "pooled_var" column indicate \ identical information in the respective rows. Note: It is advised to exclude features that provide sufficient informational \ content by themselves as well as the target column by using the "exclude" \ setting. Parameters ---------- data : pd.DataFrame 2D dataset that can be coerced into Pandas DataFrame col_dupl_thresh : float, optional Columns with a ratio of duplicates higher than "col_dupl_thresh" are \ considered in the further analysis. Columns with a lower ratio are not \ considered for pooling, by default 0.2 subset_thresh : float, optional The first subset with a duplicate threshold higher than "subset_thresh" is \ chosen and aggregated. If no subset reaches the threshold, the algorithm \ continues with continuously smaller subsets until "min_col_pool" is reached, \ by default 0.2 min_col_pool : int, optional Minimum number of columns to pool. The algorithm attempts to combine as many \ columns as possible to suitable subsets and stops when "min_col_pool" is \ reached, by default 3 exclude : Optional[List[str]], optional List of column names to be excluded from the analysis. These columns are \ passed through without modification, by default None return_details : bool, optional Provdies flexibility to return intermediary results, by default False Returns ------- pd.DataFrame DataFrame with low cardinality columns pooled optional: subset_cols: List of columns used as subset """ # Input validation _validate_input_range(col_dupl_thresh, "col_dupl_thresh", 0, 1) _validate_input_range(subset_thresh, "subset_thresh", 0, 1) _validate_input_range(min_col_pool, "min_col_pool", 0, data.shape[1]) excluded_cols = [] if exclude is not None: excluded_cols = data[exclude] data = data.drop(columns=exclude) subset_cols = [] for i in range(data.shape[1] + 1 - min_col_pool): check_list = [ col for col in data.columns if data.duplicated(subset=col).mean() > col_dupl_thresh ] if len(check_list) > 0: combinations = itertools.combinations(check_list, len(check_list) - i) else: continue ratios = [ *map(lambda comb: data.duplicated(subset=list(comb)).mean(), combinations) ] max_ratio = max(ratios) max_idx = np.argmax(ratios) if max_ratio > subset_thresh: best_subset = itertools.islice( itertools.combinations(check_list, len(check_list) - i), max_idx, max_idx + 1, ) best_subset = data[list(list(best_subset)[0])] subset_cols = best_subset.columns.tolist() unique_subset = ( best_subset.drop_duplicates().reset_index().rename( columns={"index": "pooled_vars"})) data = data.merge(unique_subset, how="left", on=best_subset.columns.tolist()).drop( columns=best_subset.columns.tolist()) data.index = pd.RangeIndex(len(data)) break data = pd.concat([data, pd.DataFrame(excluded_cols)], axis=1) if return_details: return data, subset_cols return data
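# A minimal usage sketch for pool_duplicate_subsets above (not part of the original source).
# It assumes the function and its validation helpers are importable from this module and that
# pandas is installed; the toy frame, column names, and thresholds are illustrative only.
import pandas as pd

toy = pd.DataFrame({
    "city":  ["A", "A", "B", "B", "A", "B"],     # highly duplicated
    "zip":   [10, 10, 20, 20, 10, 20],           # duplicates "city" almost exactly
    "sales": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],     # informative, excluded from pooling
})

pooled, subset_cols = pool_duplicate_subsets(
    toy,
    col_dupl_thresh=0.2,
    subset_thresh=0.2,
    min_col_pool=2,
    exclude=["sales"],
    return_details=True,
)
# "city" and "zip" are merged into a single "pooled_vars" column; identical pool ids mark
# rows that carried identical information in the pooled columns, while "sales" passes through.
print(subset_cols)
print(pooled)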
colourlist = ['#0033CC', '#33CC33', '#FFAA00', '#CC3300', '#AAAAAA', '#0032FF',
              'r', 'c', 'm', 'y', '#000000', '#333333']
#colourlist = ['#AAAAAA','#
#filename = '/tier2/dickson/bathd/FlyMAD/JAABA_tracking/140927/wing_angles_nano.csv'
#binsize = '5s'  # ex: '1s' or '4Min' etc
#BAG_FILE = '/groups/dickson/home/bathd/Dropbox/140927_flymad_rosbag_copy/rosbagOut_2014-09-27-14-53-54.bag'

if 1:  #COMPILE_FOLDERS == False:
    baglist = []
    for bag in glob.glob(BAGS + '/*.bag'):
        bagtimestamp = parse_bagtime(bag)
        baglist.append((bag, bagtimestamp))
    bagframe = DataFrame(baglist, columns=['Filepath', 'Timestamp'])
    bagframe.index = pd.to_datetime(bagframe['Timestamp'])
    bagframe = bagframe.sort()
    bagframe.to_csv(BAGS + '/list_of_bags.csv', sep=',')

    if not os.path.exists(JAABA + 'JAR') == True:
        print "MAKING A JAR"
        os.makedirs(JAABA + 'JAR')
    if not os.path.exists(JAABA + 'TRACES') == True:
        os.makedirs(JAABA + 'TRACES')

    updated = False

    for directory in glob.glob(JAABA + '*' + HANDLE + '*' + '*zoom*'):
        FLY_ID, FMF_TIME, GROUP = parse_fmftime(directory)
        if not os.path.exists(JAABA + 'JAR/' + FLY_ID + '_' + binsize + '_fly.pickle') == True:
def _flex_binary_moment(arg1, arg2, f, pairwise=False): if not (isinstance(arg1, (np.ndarray, ABCSeries, ABCDataFrame)) and isinstance(arg2, (np.ndarray, ABCSeries, ABCDataFrame))): raise TypeError("arguments to moment function must be of type " "np.ndarray/Series/DataFrame") if isinstance(arg1, (np.ndarray, ABCSeries)) and isinstance( arg2, (np.ndarray, ABCSeries)): X, Y = _prep_binary(arg1, arg2) return f(X, Y) elif isinstance(arg1, ABCDataFrame): from pandas import DataFrame def dataframe_from_int_dict(data, frame_template): result = DataFrame(data, index=frame_template.index) if len(result.columns) > 0: result.columns = frame_template.columns[result.columns] return result results = {} if isinstance(arg2, ABCDataFrame): if pairwise is False: if arg1 is arg2: # special case in order to handle duplicate column names for i, col in enumerate(arg1.columns): results[i] = f(arg1.iloc[:, i], arg2.iloc[:, i]) return dataframe_from_int_dict(results, arg1) else: if not arg1.columns.is_unique: raise ValueError("'arg1' columns are not unique") if not arg2.columns.is_unique: raise ValueError("'arg2' columns are not unique") with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) X, Y = arg1.align(arg2, join="outer") X = X + 0 * Y Y = Y + 0 * X with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) res_columns = arg1.columns.union(arg2.columns) for col in res_columns: if col in X and col in Y: results[col] = f(X[col], Y[col]) return DataFrame(results, index=X.index, columns=res_columns) elif pairwise is True: results = defaultdict(dict) for i, k1 in enumerate(arg1.columns): for j, k2 in enumerate(arg2.columns): if j < i and arg2 is arg1: # Symmetric case results[i][j] = results[j][i] else: results[i][j] = f( *_prep_binary(arg1.iloc[:, i], arg2.iloc[:, j])) from pandas import concat result_index = arg1.index.union(arg2.index) if len(result_index): # construct result frame result = concat( [ concat( [ results[i][j] for j, c in enumerate(arg2.columns) ], ignore_index=True, ) for i, c in enumerate(arg1.columns) ], ignore_index=True, axis=1, ) result.columns = arg1.columns # set the index and reorder if arg2.columns.nlevels > 1: result.index = MultiIndex.from_product( arg2.columns.levels + [result_index]) result = result.reorder_levels([2, 0, 1]).sort_index() else: result.index = MultiIndex.from_product([ range(len(arg2.columns)), range(len(result_index)) ]) result = result.swaplevel(1, 0).sort_index() result.index = MultiIndex.from_product([result_index] + [arg2.columns]) else: # empty result result = DataFrame( index=MultiIndex(levels=[arg1.index, arg2.columns], codes=[[], []]), columns=arg2.columns, dtype="float64", ) # reset our index names to arg1 names # reset our column names to arg2 names # careful not to mutate the original names result.columns = result.columns.set_names(arg1.columns.names) result.index = result.index.set_names(result_index.names + arg2.columns.names) return result else: raise ValueError("'pairwise' is not True/False") else: results = { i: f(*_prep_binary(arg1.iloc[:, i], arg2)) for i, col in enumerate(arg1.columns) } return dataframe_from_int_dict(results, arg1) else: return _flex_binary_moment(arg2, arg1, f)
def test_ix_loc_consistency(self): # GH 8613 # some edge cases where ix/loc should return the same # this is not an exhaustive case def compare(result, expected): if is_scalar(expected): assert result == expected else: assert expected.equals(result) # failure cases for .loc, but these work for .ix df = DataFrame(np.random.randn(5, 4), columns=list('ABCD')) for key in [slice(1, 3), tuple([slice(0, 2), slice(0, 2)]), tuple([slice(0, 2), df.columns[0:2]])]: for index in [tm.makeStringIndex, tm.makeUnicodeIndex, tm.makeDateIndex, tm.makePeriodIndex, tm.makeTimedeltaIndex]: df.index = index(len(df.index)) with catch_warnings(record=True): df.ix[key] msg = (r"cannot do slice indexing" r" on {klass} with these indexers \[(0|1)\] of" r" {kind}" .format(klass=type(df.index), kind=str(int))) with pytest.raises(TypeError, match=msg): df.loc[key] df = DataFrame(np.random.randn(5, 4), columns=list('ABCD'), index=pd.date_range('2012-01-01', periods=5)) for key in ['2012-01-03', '2012-01-31', slice('2012-01-03', '2012-01-03'), slice('2012-01-03', '2012-01-04'), slice('2012-01-03', '2012-01-06', 2), slice('2012-01-03', '2012-01-31'), tuple([[True, True, True, False, True]]), ]: # getitem # if the expected raises, then compare the exceptions try: with catch_warnings(record=True): expected = df.ix[key] except KeyError: with pytest.raises(KeyError, match=r"^'2012-01-31'$"): df.loc[key] continue result = df.loc[key] compare(result, expected) # setitem df1 = df.copy() df2 = df.copy() with catch_warnings(record=True): df1.ix[key] = 10 df2.loc[key] = 10 compare(df2, df1) # edge cases s = Series([1, 2, 3, 4], index=list('abde')) result1 = s['a':'c'] with catch_warnings(record=True): result2 = s.ix['a':'c'] result3 = s.loc['a':'c'] tm.assert_series_equal(result1, result2) tm.assert_series_equal(result1, result3) # now work rather than raising KeyError s = Series(range(5), [-2, -1, 1, 2, 3]) with catch_warnings(record=True): result1 = s.ix[-10:3] result2 = s.loc[-10:3] tm.assert_series_equal(result1, result2) with catch_warnings(record=True): result1 = s.ix[0:3] result2 = s.loc[0:3] tm.assert_series_equal(result1, result2)
def test_timegrouper_with_reg_groups(self): # GH 3794 # allow combination of timegrouper/reg groups df_original = DataFrame( { "Branch": "A A A A A A A B".split(), "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(), "Quantity": [1, 3, 5, 1, 8, 1, 9, 3], "Date": [ datetime(2013, 1, 1, 13, 0), datetime(2013, 1, 1, 13, 5), datetime(2013, 10, 1, 20, 0), datetime(2013, 10, 2, 10, 0), datetime(2013, 10, 1, 20, 0), datetime(2013, 10, 2, 10, 0), datetime(2013, 12, 2, 12, 0), datetime(2013, 12, 2, 14, 0), ], } ).set_index("Date") df_sorted = df_original.sort_values(by="Quantity", ascending=False) for df in [df_original, df_sorted]: expected = DataFrame( { "Buyer": "Carl Joe Mark".split(), "Quantity": [10, 18, 3], "Date": [ datetime(2013, 12, 31, 0, 0), datetime(2013, 12, 31, 0, 0), datetime(2013, 12, 31, 0, 0), ], } ).set_index(["Date", "Buyer"]) result = df.groupby([Grouper(freq="A"), "Buyer"]).sum() tm.assert_frame_equal(result, expected) expected = DataFrame( { "Buyer": "Carl Mark Carl Joe".split(), "Quantity": [1, 3, 9, 18], "Date": [ datetime(2013, 1, 1, 0, 0), datetime(2013, 1, 1, 0, 0), datetime(2013, 7, 1, 0, 0), datetime(2013, 7, 1, 0, 0), ], } ).set_index(["Date", "Buyer"]) result = df.groupby([Grouper(freq="6MS"), "Buyer"]).sum() tm.assert_frame_equal(result, expected) df_original = DataFrame( { "Branch": "A A A A A A A B".split(), "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(), "Quantity": [1, 3, 5, 1, 8, 1, 9, 3], "Date": [ datetime(2013, 10, 1, 13, 0), datetime(2013, 10, 1, 13, 5), datetime(2013, 10, 1, 20, 0), datetime(2013, 10, 2, 10, 0), datetime(2013, 10, 1, 20, 0), datetime(2013, 10, 2, 10, 0), datetime(2013, 10, 2, 12, 0), datetime(2013, 10, 2, 14, 0), ], } ).set_index("Date") df_sorted = df_original.sort_values(by="Quantity", ascending=False) for df in [df_original, df_sorted]: expected = DataFrame( { "Buyer": "Carl Joe Mark Carl Joe".split(), "Quantity": [6, 8, 3, 4, 10], "Date": [ datetime(2013, 10, 1, 0, 0), datetime(2013, 10, 1, 0, 0), datetime(2013, 10, 1, 0, 0), datetime(2013, 10, 2, 0, 0), datetime(2013, 10, 2, 0, 0), ], } ).set_index(["Date", "Buyer"]) result = df.groupby([Grouper(freq="1D"), "Buyer"]).sum() tm.assert_frame_equal(result, expected) result = df.groupby([Grouper(freq="1M"), "Buyer"]).sum() expected = DataFrame( { "Buyer": "Carl Joe Mark".split(), "Quantity": [10, 18, 3], "Date": [ datetime(2013, 10, 31, 0, 0), datetime(2013, 10, 31, 0, 0), datetime(2013, 10, 31, 0, 0), ], } ).set_index(["Date", "Buyer"]) tm.assert_frame_equal(result, expected) # passing the name df = df.reset_index() result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum() tm.assert_frame_equal(result, expected) with pytest.raises(KeyError, match="'The grouper name foo is not found'"): df.groupby([Grouper(freq="1M", key="foo"), "Buyer"]).sum() # passing the level df = df.set_index("Date") result = df.groupby([Grouper(freq="1M", level="Date"), "Buyer"]).sum() tm.assert_frame_equal(result, expected) result = df.groupby([Grouper(freq="1M", level=0), "Buyer"]).sum() tm.assert_frame_equal(result, expected) with pytest.raises(ValueError, match="The level foo is not valid"): df.groupby([Grouper(freq="1M", level="foo"), "Buyer"]).sum() # multi names df = df.copy() df["Date"] = df.index + offsets.MonthEnd(2) result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum() expected = DataFrame( { "Buyer": "Carl Joe Mark".split(), "Quantity": [10, 18, 3], "Date": [ datetime(2013, 11, 30, 0, 0), datetime(2013, 11, 30, 0, 0), datetime(2013, 11, 30, 0, 0), ], } 
).set_index(["Date", "Buyer"]) tm.assert_frame_equal(result, expected) # error as we have both a level and a name! msg = "The Grouper cannot specify both a key and a level!" with pytest.raises(ValueError, match=msg): df.groupby( [Grouper(freq="1M", key="Date", level="Date"), "Buyer"] ).sum() # single groupers expected = DataFrame( [[31]], columns=["Quantity"], index=DatetimeIndex( [datetime(2013, 10, 31, 0, 0)], freq=offsets.MonthEnd(), name="Date" ), ) result = df.groupby(Grouper(freq="1M")).sum() tm.assert_frame_equal(result, expected) result = df.groupby([Grouper(freq="1M")]).sum() tm.assert_frame_equal(result, expected) expected.index = expected.index.shift(1) assert expected.index.freq == offsets.MonthEnd() result = df.groupby(Grouper(freq="1M", key="Date")).sum() tm.assert_frame_equal(result, expected) result = df.groupby([Grouper(freq="1M", key="Date")]).sum() tm.assert_frame_equal(result, expected)
data.replace(-999, np.nan)
data.replace([-999, -1000], np.nan)
data.replace([-999, -1000], [np.nan, 0])
data.replace({-999: np.nan, -1000: 0})

### Renaming axis indexes
data = DataFrame(np.arange(12).reshape((3, 4)),
                 index=['Ohio', 'Colorado', 'New York'],
                 columns=['one', 'two', 'three', 'four'])
data.index.map(str.upper)
data.index = data.index.map(str.upper)
data
data.rename(index=str.title, columns=str.upper)
data.rename(index={'OHIO': 'INDIANA'}, columns={'three': 'peekaboo'})
# always returns a reference to the DataFrame
_ = data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
data

### Discretization and binning
# 1
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
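# The ages/bins lists above lead straight into pd.cut; a small sketch of the
# binning step (standard pandas API, not part of the original notes):
import pandas as pd

ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)              # Categorical of intervals like (18, 25]
print(pd.Series(cats).value_counts())  # observations per bin
print(pd.cut(ages, bins, labels=['youth', 'young adult', 'middle aged', 'senior']))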
def test_to_frame(): tuples = [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')] index = MultiIndex.from_tuples(tuples) result = index.to_frame(index=False) expected = DataFrame(tuples) tm.assert_frame_equal(result, expected) result = index.to_frame() expected.index = index tm.assert_frame_equal(result, expected) tuples = [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')] index = MultiIndex.from_tuples(tuples, names=['first', 'second']) result = index.to_frame(index=False) expected = DataFrame(tuples) expected.columns = ['first', 'second'] tm.assert_frame_equal(result, expected) result = index.to_frame() expected.index = index tm.assert_frame_equal(result, expected) # See GH-22580 index = MultiIndex.from_tuples(tuples) result = index.to_frame(index=False, name=['first', 'second']) expected = DataFrame(tuples) expected.columns = ['first', 'second'] tm.assert_frame_equal(result, expected) result = index.to_frame(name=['first', 'second']) expected.index = index expected.columns = ['first', 'second'] tm.assert_frame_equal(result, expected) msg = "'name' must be a list / sequence of column names." with tm.assert_raises_regex(TypeError, msg): index.to_frame(name='first') msg = "'name' should have same length as number of levels on index." with tm.assert_raises_regex(ValueError, msg): index.to_frame(name=['first']) # Tests for datetime index index = MultiIndex.from_product( [range(5), pd.date_range('20130101', periods=3)]) result = index.to_frame(index=False) expected = DataFrame({ 0: np.repeat(np.arange(5, dtype='int64'), 3), 1: np.tile(pd.date_range('20130101', periods=3), 5) }) tm.assert_frame_equal(result, expected) result = index.to_frame() expected.index = index tm.assert_frame_equal(result, expected) # See GH-22580 result = index.to_frame(index=False, name=['first', 'second']) expected = DataFrame({ 'first': np.repeat(np.arange(5, dtype='int64'), 3), 'second': np.tile(pd.date_range('20130101', periods=3), 5) }) tm.assert_frame_equal(result, expected) result = index.to_frame(name=['first', 'second']) expected.index = index tm.assert_frame_equal(result, expected)
def table(self, from_date=None, to_date=None): ''' Returns the league table with the global basic information. ''' played = self.played_matches(from_date, to_date) points = self.points(from_date, to_date) points = Series(points["Points"].tolist(), index=points["Team"].tolist()) matches = self.matches(from_date, to_date) home_grouped = matches.groupby( ["HomeTeam"]).apply(lambda df_group: len(df_group[df_group[ "FTHG"] > df_group["FTAG"]])) away_grouped = matches.groupby( ["AwayTeam"]).apply(lambda df_group: len(df_group[df_group[ "FTAG"] > df_group["FTHG"]])) won = home_grouped + away_grouped home_grouped = matches.groupby( ["HomeTeam"]).apply(lambda df_group: len(df_group[df_group[ "FTHG"] == df_group["FTAG"]])) away_grouped = matches.groupby( ["AwayTeam"]).apply(lambda df_group: len(df_group[df_group[ "FTAG"] == df_group["FTHG"]])) draw = home_grouped + away_grouped home_grouped = matches.groupby( ["HomeTeam"]).apply(lambda df_group: len(df_group[df_group[ "FTHG"] < df_group["FTAG"]])) away_grouped = matches.groupby( ["AwayTeam"]).apply(lambda df_group: len(df_group[df_group[ "FTAG"] < df_group["FTHG"]])) lost = home_grouped + away_grouped home_grouped = matches.groupby( ["HomeTeam"]).apply(lambda df_group: sum(df_group["FTHG"])) away_grouped = matches.groupby( ["AwayTeam"]).apply(lambda df_group: sum(df_group["FTAG"])) goals_for = home_grouped + away_grouped home_grouped = matches.groupby( ["HomeTeam"]).apply(lambda df_group: sum(df_group["FTAG"])) away_grouped = matches.groupby( ["AwayTeam"]).apply(lambda df_group: sum(df_group["FTHG"])) goals_aga = home_grouped + away_grouped table = DataFrame( dict(Points=points, Played=played, Won=won, Draw=draw, Lost=lost, GF=goals_for, GA=goals_aga, GD=goals_for - goals_aga)).reset_index() table = table.rename(columns={"index": "Team"}) table = table[[ "Points", "Team", "Played", "Won", "Draw", "Lost", "GF", "GA", "GD" ]].sort_values(by=["Points", "GD"], ascending=False).reset_index() table.index = range(1, len(table) + 1) del table["index"] return table
def reformat_index(x: pd.DataFrame): x.index = [str(a) for a in x.index] x.index.name = "moneyness_cut" x.columns = [str(a) for a in x.columns] x.columns.name = "time_cut"
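# Usage sketch for reformat_index above (hypothetical data): the function mutates
# the frame in place, stringifying both axes and naming them.
import pandas as pd

grid = pd.DataFrame([[0.1, 0.2], [0.3, 0.4]], index=[0.9, 1.1], columns=[30, 60])
reformat_index(grid)
print(grid.index)    # Index(['0.9', '1.1'], dtype='object', name='moneyness_cut')
print(grid.columns)  # Index(['30', '60'], dtype='object', name='time_cut')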
def prepare_data(wt_organ_vol: pd.DataFrame,
                 wt_staging: pd.DataFrame,
                 mut_organ_vol: pd.DataFrame,
                 mut_staging: pd.DataFrame,
                 label_meta: Path = None,
                 normalise_to_whole_embryo=False,
                 qc_file: Path = None) -> pd.DataFrame:
    """
    Merge the mutant and wildtype dataframes
    Optionally normalise to staging metric (usually whole embryo volume)
    Optionally remove any qc-flagged organs (these will be set to 'nan')

    Returns
    -------
    Dataframe with the following columns:
        - a column for each label (prefixed with 'x' as statsmodels does not like integer ids)
        - line
        - genotype (baseline or mutant)
        - staging (whole embryo volume)
    """
    wt_staging.rename(columns={'value': 'staging'}, inplace=True)
    mut_staging.rename(columns={'value': 'staging'}, inplace=True)
    wt_staging.index = wt_staging.index.astype(str)

    # Ensure all indexes are the same type
    for d in [wt_organ_vol, mut_organ_vol, wt_staging, mut_staging]:
        d.index = d.index.astype(str)

    if normalise_to_whole_embryo:
        wt_organ_vol = wt_organ_vol.divide(wt_staging['staging'], axis=0)
        mut_organ_vol = mut_organ_vol.divide(mut_staging['staging'], axis=0)
        logging.info('Normalising organ volume to whole embryo volume')

    # merge the organ vol
    organ_vols = pd.concat([wt_organ_vol, mut_organ_vol])

    # Drop any organ column that has only zero values. These are the gaps in the label map
    # caused by merging labels in the atlas
    organ_vols = organ_vols.loc[:, (organ_vols != 0).any(axis=0)]

    # For the statsmodels linear model to work, column names cannot start with a digit. Prefix with 'x'
    organ_vols.columns = [f'x{x}' if x.isdigit() else x for x in organ_vols.columns]

    staging = pd.concat([wt_staging, mut_staging])

    # Merge staging onto the organ-volume dataframe. First drop 'line' so we don't get duplicate entries
    # staging.drop(columns=['line'], inplace=True)
    data = pd.concat([organ_vols, staging], axis=1)

    # Filter any labels that have been flagged at the label level (for all specimens)
    if label_meta:
        label_meta = pd.read_csv(label_meta, index_col=0)

        if 'no_analysis' in label_meta:
            # If we have a no_analysis column, drop labels that are flagged
            flagged_labels = label_meta[label_meta.no_analysis == True].index
            data.drop(columns=[f'x{x}' for x in flagged_labels if f'x{x}' in data], inplace=True)

    # QC-flagged organs from specimens specified in the QC file are set to None
    if qc_file:
        logging.info(f'Excluding organ volumes specified in: {qc_file}')
        qc = pd.read_csv(qc_file)

        for _, row in qc.iterrows():
            qc_id = str(row.id)
            if qc_id not in data.index:
                raise LamaDataException(f'QC flagged specimen {row.id} does not exist in dataset')
            if f'x{row.label}' not in data:
                raise LamaDataException(f'QC flagged label, {row.label}, does not exist in dataset')

            data.loc[qc_id, f'x{row.label}'] = None

    return data
def test_basic(): cats = Categorical( ["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"], ordered=True, ) data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) exp_index = CategoricalIndex(list("abcd"), name="b", ordered=True) expected = DataFrame({"a": [1, 2, 4, np.nan]}, index=exp_index) result = data.groupby("b", observed=False).mean() tm.assert_frame_equal(result, expected) cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) # single grouper gb = df.groupby("A", observed=False) exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True) expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)}) result = gb.sum() tm.assert_frame_equal(result, expected) # GH 8623 x = DataFrame( [[1, "John P. Doe"], [2, "Jane Dove"], [1, "John P. Doe"]], columns=["person_id", "person_name"], ) x["person_name"] = Categorical(x.person_name) g = x.groupby(["person_id"], observed=False) result = g.transform(lambda x: x) tm.assert_frame_equal(result, x[["person_name"]]) result = x.drop_duplicates("person_name") expected = x.iloc[[0, 1]] tm.assert_frame_equal(result, expected) def f(x): return x.drop_duplicates("person_name").iloc[0] result = g.apply(f) expected = x.iloc[[0, 1]].copy() expected.index = Index([1, 2], name="person_id") expected["person_name"] = expected["person_name"].astype("object") tm.assert_frame_equal(result, expected) # GH 9921 # Monotonic df = DataFrame({"a": [5, 15, 25]}) c = pd.cut(df.a, bins=[0, 10, 20, 30, 40]) result = df.a.groupby(c, observed=False).transform(sum) tm.assert_series_equal(result, df["a"]) tm.assert_series_equal( df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df["a"]) tm.assert_frame_equal( df.groupby(c, observed=False).transform(sum), df[["a"]]) tm.assert_frame_equal( df.groupby(c, observed=False).transform(lambda xs: np.max(xs)), df[["a"]]) # Filter tm.assert_series_equal( df.a.groupby(c, observed=False).filter(np.all), df["a"]) tm.assert_frame_equal(df.groupby(c, observed=False).filter(np.all), df) # Non-monotonic df = DataFrame({"a": [5, 15, 25, -5]}) c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40]) result = df.a.groupby(c, observed=False).transform(sum) tm.assert_series_equal(result, df["a"]) tm.assert_series_equal( df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df["a"]) tm.assert_frame_equal( df.groupby(c, observed=False).transform(sum), df[["a"]]) tm.assert_frame_equal( df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df[["a"]]) # GH 9603 df = DataFrame({"a": [1, 0, 0, 0]}) c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list("abcd"))) result = df.groupby(c, observed=False).apply(len) exp_index = CategoricalIndex(c.values.categories, ordered=c.values.ordered) expected = Series([1, 0, 0, 0], index=exp_index) expected.index.name = "a" tm.assert_series_equal(result, expected) # more basic levels = ["foo", "bar", "baz", "qux"] codes = np.random.randint(0, 4, size=100) cats = Categorical.from_codes(codes, levels, ordered=True) data = DataFrame(np.random.randn(100, 4)) result = data.groupby(cats, observed=False).mean() expected = data.groupby(np.asarray(cats), observed=False).mean() exp_idx = CategoricalIndex(levels, categories=cats.categories, ordered=True) expected = expected.reindex(exp_idx) tm.assert_frame_equal(result, expected) grouped = data.groupby(cats, observed=False) 
desc_result = grouped.describe() idx = cats.codes.argsort() ord_labels = np.asarray(cats).take(idx) ord_data = data.take(idx) exp_cats = Categorical(ord_labels, ordered=True, categories=["foo", "bar", "baz", "qux"]) expected = ord_data.groupby(exp_cats, sort=False, observed=False).describe() tm.assert_frame_equal(desc_result, expected) # GH 10460 expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True) exp = CategoricalIndex(expc) tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp) exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4) tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp)
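# The observed=False behaviour the test above relies on, reduced to a toy frame:
# unused categories still show up in the grouped result, while observed=True drops
# them. Public pandas API only.
import pandas as pd

df = pd.DataFrame({
    "grade": pd.Categorical(["a", "a", "b"], categories=["a", "b", "c"]),
    "score": [1, 2, 3],
})
print(df.groupby("grade", observed=False)["score"].sum())  # includes 'c' (sum 0)
print(df.groupby("grade", observed=True)["score"].sum())   # only 'a' and 'b'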
def test_sort_datetimelike(): # GH10505 # use same data as test_groupby_sort_categorical, which category is # corresponding to datetime.month df = DataFrame( { "dt": [ datetime(2011, 7, 1), datetime(2011, 7, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 2, 1), datetime(2011, 1, 1), datetime(2011, 5, 1), ], "foo": [10, 8, 5, 6, 4, 1, 7], "bar": [10, 20, 30, 40, 50, 60, 70], }, columns=["dt", "foo", "bar"], ) # ordered=True df["dt"] = Categorical(df["dt"], ordered=True) index = [ datetime(2011, 1, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 7, 1), ] result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"]) result_sort.index = CategoricalIndex(index, name="dt", ordered=True) index = [ datetime(2011, 7, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 1, 1), ] result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], columns=["foo", "bar"]) result_nosort.index = CategoricalIndex(index, categories=index, name="dt", ordered=True) col = "dt" tm.assert_frame_equal(result_sort, df.groupby(col, sort=True, observed=False).first()) # when categories is ordered, group is ordered by category's order tm.assert_frame_equal(result_sort, df.groupby(col, sort=False, observed=False).first()) # ordered = False df["dt"] = Categorical(df["dt"], ordered=False) index = [ datetime(2011, 1, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 7, 1), ] result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"]) result_sort.index = CategoricalIndex(index, name="dt") index = [ datetime(2011, 7, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 1, 1), ] result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], columns=["foo", "bar"]) result_nosort.index = CategoricalIndex(index, categories=index, name="dt") col = "dt" tm.assert_frame_equal(result_sort, df.groupby(col, sort=True, observed=False).first()) tm.assert_frame_equal(result_nosort, df.groupby(col, sort=False, observed=False).first())
def test_convert_dti_to_series(self): # don't cast a DatetimeIndex WITH a tz, leave as object # GH 6032 idx = DatetimeIndex(to_datetime(["2013-1-1 13:00", "2013-1-2 14:00"]), name="B").tz_localize("US/Pacific") df = DataFrame(np.random.randn(2, 1), columns=["A"]) expected = Series( np.array( [ Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"), Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific"), ], dtype="object", ), name="B", ) # convert index to series result = Series(idx) tm.assert_series_equal(result, expected) # assign to frame df["B"] = idx result = df["B"] tm.assert_series_equal(result, expected) # convert to series while keeping the timezone msg = "stop passing 'keep_tz'" with tm.assert_produces_warning(FutureWarning) as m: result = idx.to_series(keep_tz=True, index=[0, 1]) tm.assert_series_equal(result, expected) assert msg in str(m[0].message) # convert to utc with tm.assert_produces_warning(FutureWarning) as m: df["B"] = idx.to_series(keep_tz=False, index=[0, 1]) result = df["B"] comp = Series(DatetimeIndex(expected.values).tz_localize(None), name="B") tm.assert_series_equal(result, comp) msg = "do 'idx.tz_convert(None)' before calling" assert msg in str(m[0].message) result = idx.to_series(index=[0, 1]) tm.assert_series_equal(result, expected) with tm.assert_produces_warning(FutureWarning) as m: result = idx.to_series(keep_tz=False, index=[0, 1]) tm.assert_series_equal(result, expected.dt.tz_convert(None)) msg = "do 'idx.tz_convert(None)' before calling" assert msg in str(m[0].message) # list of datetimes with a tz df["B"] = idx.to_pydatetime() result = df["B"] tm.assert_series_equal(result, expected) # GH 6785 # set the index manually import pytz df = DataFrame([{ "ts": datetime(2014, 4, 1, tzinfo=pytz.utc), "foo": 1 }]) expected = df.set_index("ts") df.index = df["ts"] df.pop("ts") tm.assert_frame_equal(df, expected)
def test_read_excel_multiindex(self, read_ext): # see gh-4679 if pd.read_excel.keywords["engine"] == "pyxlsb": pytest.xfail("Sheets containing datetimes not supported by pyxlsb") mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]]) mi_file = "testmultiindex" + read_ext # "mi_column" sheet expected = DataFrame( [ [1, 2.5, pd.Timestamp("2015-01-01"), True], [2, 3.5, pd.Timestamp("2015-01-02"), False], [3, 4.5, pd.Timestamp("2015-01-03"), False], [4, 5.5, pd.Timestamp("2015-01-04"), True], ], columns=mi, ) actual = pd.read_excel( mi_file, sheet_name="mi_column", header=[0, 1], index_col=0 ) tm.assert_frame_equal(actual, expected) # "mi_index" sheet expected.index = mi expected.columns = ["a", "b", "c", "d"] actual = pd.read_excel(mi_file, sheet_name="mi_index", index_col=[0, 1]) tm.assert_frame_equal(actual, expected, check_names=False) # "both" sheet expected.columns = mi actual = pd.read_excel( mi_file, sheet_name="both", index_col=[0, 1], header=[0, 1] ) tm.assert_frame_equal(actual, expected, check_names=False) # "mi_index_name" sheet expected.columns = ["a", "b", "c", "d"] expected.index = mi.set_names(["ilvl1", "ilvl2"]) actual = pd.read_excel(mi_file, sheet_name="mi_index_name", index_col=[0, 1]) tm.assert_frame_equal(actual, expected) # "mi_column_name" sheet expected.index = list(range(4)) expected.columns = mi.set_names(["c1", "c2"]) actual = pd.read_excel( mi_file, sheet_name="mi_column_name", header=[0, 1], index_col=0 ) tm.assert_frame_equal(actual, expected) # see gh-11317 # "name_with_int" sheet expected.columns = mi.set_levels([1, 2], level=1).set_names(["c1", "c2"]) actual = pd.read_excel( mi_file, sheet_name="name_with_int", index_col=0, header=[0, 1] ) tm.assert_frame_equal(actual, expected) # "both_name" sheet expected.columns = mi.set_names(["c1", "c2"]) expected.index = mi.set_names(["ilvl1", "ilvl2"]) actual = pd.read_excel( mi_file, sheet_name="both_name", index_col=[0, 1], header=[0, 1] ) tm.assert_frame_equal(actual, expected) # "both_skiprows" sheet actual = pd.read_excel( mi_file, sheet_name="both_name_skiprows", index_col=[0, 1], header=[0, 1], skiprows=2, ) tm.assert_frame_equal(actual, expected)
def test_apply_multi_index(self): s = DataFrame([[1, 2], [3, 4], [5, 6]]) s.index = MultiIndex.from_arrays([['a', 'a', 'b'], ['c', 'd', 'd']]) s.columns = ['col1', 'col2'] res = s.apply(lambda x: Series({'min': min(x), 'max': max(x)}), 1) assert isinstance(res.index, MultiIndex)
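# A standalone version of the behaviour asserted above: a row-wise apply that
# returns a Series produces new columns while preserving the (Multi)Index.
# Hypothetical data, public pandas API only.
import pandas as pd

df = pd.DataFrame([[1, 2], [3, 4], [5, 6]],
                  index=pd.MultiIndex.from_arrays([['a', 'a', 'b'], ['c', 'd', 'd']]),
                  columns=['col1', 'col2'])
res = df.apply(lambda row: pd.Series({'min': row.min(), 'max': row.max()}), axis=1)
print(res.index)  # the original MultiIndex is carried through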
def bdh(self, ticker_list, fld_list, start_date, end_date=date.today().strftime('%Y%m%d'), periodselection='DAILY', overrides=None): """ Get ticker_list and field_list return pandas multi level columns dataframe """ # Create and fill the request for the historical data self.service_refData() if isstring(ticker_list): ticker_list = [ticker_list] if isstring(fld_list): fld_list = [fld_list] if hasattr(start_date, 'strftime'): start_date = start_date.strftime('%Y%m%d') if hasattr(end_date, 'strftime'): end_date = end_date.strftime('%Y%m%d') request = self.refDataService.createRequest("HistoricalDataRequest") for t in ticker_list: request.getElement("securities").appendValue(t) for f in fld_list: request.getElement("fields").appendValue(f) request.set("periodicityAdjustment", "ACTUAL") request.set("periodicitySelection", periodselection) request.set("startDate", start_date) request.set("endDate", end_date) if overrides is not None: overrideOuter = request.getElement('overrides') for k in overrides: override1 = overrideOuter.appendElement() override1.setElement('fieldId', k) override1.setElement('value', overrides[k]) #print("Sending Request:", request) # Send the request self.session.sendRequest(request) # defaultdict - later convert to pandas data = defaultdict(dict) # Process received events while (True): # We provide timeout to give the chance for Ctrl+C handling: ev = self.session.nextEvent(500) for msg in ev: ticker = msg.getElement('securityData').getElement( 'security').getValue() fieldData = msg.getElement('securityData').getElement( 'fieldData') for i in range(fieldData.numValues()): for j in range(1, fieldData.getValue(i).numElements()): data[(ticker, fld_list[j - 1])][fieldData.getValue( i).getElement(0).getValue()] = fieldData.getValue( i).getElement(j).getValue() if ev.eventType() == blpapi.Event.RESPONSE: # Response completly received, so we could exit break if len(fld_list) == 1: data = {k[0]: v for k, v in data.items()} data = DataFrame(data) #data.index = pd.to_datetime(data.index) return data if len(data) == 0: # security error case return DataFrame() data = DataFrame(data) data.columns = pd.MultiIndex.from_tuples(data, names=['ticker', 'field']) data.index = pd.to_datetime(data.index) return data
def _normalize( table: DataFrame, normalize, margins: bool, margins_name="All" ) -> DataFrame: if not isinstance(normalize, (bool, str)): axis_subs = {0: "index", 1: "columns"} try: normalize = axis_subs[normalize] except KeyError as err: raise ValueError("Not a valid normalize argument") from err if margins is False: # Actual Normalizations normalizers: dict[bool | str, Callable] = { "all": lambda x: x / x.sum(axis=1).sum(axis=0), "columns": lambda x: x / x.sum(), "index": lambda x: x.div(x.sum(axis=1), axis=0), } normalizers[True] = normalizers["all"] try: f = normalizers[normalize] except KeyError as err: raise ValueError("Not a valid normalize argument") from err table = f(table) table = table.fillna(0) elif margins is True: # keep index and column of pivoted table table_index = table.index table_columns = table.columns last_ind_or_col = table.iloc[-1, :].name # check if margin name is not in (for MI cases) and not equal to last # index/column and save the column and index margin if (margins_name not in last_ind_or_col) & (margins_name != last_ind_or_col): raise ValueError(f"{margins_name} not in pivoted DataFrame") column_margin = table.iloc[:-1, -1] index_margin = table.iloc[-1, :-1] # keep the core table table = table.iloc[:-1, :-1] # Normalize core table = _normalize(table, normalize=normalize, margins=False) # Fix Margins if normalize == "columns": column_margin = column_margin / column_margin.sum() table = concat([table, column_margin], axis=1) table = table.fillna(0) table.columns = table_columns elif normalize == "index": index_margin = index_margin / index_margin.sum() table = table._append(index_margin) table = table.fillna(0) table.index = table_index elif normalize == "all" or normalize is True: column_margin = column_margin / column_margin.sum() index_margin = index_margin / index_margin.sum() index_margin.loc[margins_name] = 1 table = concat([table, column_margin], axis=1) table = table._append(index_margin) table = table.fillna(0) table.index = table_index table.columns = table_columns else: raise ValueError("Not a valid normalize argument") else: raise ValueError("Not a valid margins argument") return table
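# What the normalize branches above mean at the user level; a compact illustration
# through pd.crosstab (public pandas API, not part of this module):
import pandas as pd

a = pd.Series(["x", "x", "y", "y", "y"])
b = pd.Series(["u", "v", "u", "u", "v"])
print(pd.crosstab(a, b, normalize="index"))             # each row sums to 1
print(pd.crosstab(a, b, normalize="columns"))           # each column sums to 1
print(pd.crosstab(a, b, normalize=True, margins=True))  # grand total is 1, margins renormalised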
def test_set_index_datetime(self): # GH#3950 df = DataFrame({ "label": ["a", "a", "a", "b", "b", "b"], "datetime": [ "2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00", "2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00", ], "value": range(6), }) df.index = to_datetime(df.pop("datetime"), utc=True) df.index = df.index.tz_convert("US/Pacific") expected = DatetimeIndex( [ "2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00" ], name="datetime", ) expected = expected.tz_localize("UTC").tz_convert("US/Pacific") df = df.set_index("label", append=True) tm.assert_index_equal(df.index.levels[0], expected) tm.assert_index_equal(df.index.levels[1], Index(["a", "b"], name="label")) assert df.index.names == ["datetime", "label"] df = df.swaplevel(0, 1) tm.assert_index_equal(df.index.levels[0], Index(["a", "b"], name="label")) tm.assert_index_equal(df.index.levels[1], expected) assert df.index.names == ["label", "datetime"] df = DataFrame(np.random.random(6)) idx1 = DatetimeIndex( [ "2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00", "2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00", ], tz="US/Eastern", ) idx2 = DatetimeIndex( [ "2012-04-01 09:00", "2012-04-01 09:00", "2012-04-01 09:00", "2012-04-02 09:00", "2012-04-02 09:00", "2012-04-02 09:00", ], tz="US/Eastern", ) idx3 = date_range("2011-01-01 09:00", periods=6, tz="Asia/Tokyo") idx3 = idx3._with_freq(None) df = df.set_index(idx1) df = df.set_index(idx2, append=True) df = df.set_index(idx3, append=True) expected1 = DatetimeIndex( [ "2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00" ], tz="US/Eastern", ) expected2 = DatetimeIndex(["2012-04-01 09:00", "2012-04-02 09:00"], tz="US/Eastern") tm.assert_index_equal(df.index.levels[0], expected1) tm.assert_index_equal(df.index.levels[1], expected2) tm.assert_index_equal(df.index.levels[2], idx3) # GH#7092 tm.assert_index_equal(df.index.get_level_values(0), idx1) tm.assert_index_equal(df.index.get_level_values(1), idx2) tm.assert_index_equal(df.index.get_level_values(2), idx3)
def _generate_marginal_results(table, data, values, rows, cols, aggfunc, observed, margins_name: str = "All"): if len(cols) > 0: # need to "interleave" the margins table_pieces = [] margin_keys = [] def _all_key(key): return (key, margins_name) + ("", ) * (len(cols) - 1) if len(rows) > 0: margin = data[rows + values].groupby( rows, observed=observed).agg(aggfunc) cat_axis = 1 for key, piece in table.groupby(level=0, axis=cat_axis, observed=observed): all_key = _all_key(key) # we are going to mutate this, so need to copy! piece = piece.copy() piece[all_key] = margin[key] table_pieces.append(piece) margin_keys.append(all_key) else: from pandas import DataFrame cat_axis = 0 for key, piece in table.groupby(level=0, axis=cat_axis, observed=observed): if len(cols) > 1: all_key = _all_key(key) else: all_key = margins_name table_pieces.append(piece) # GH31016 this is to calculate margin for each group, and assign # corresponded key as index transformed_piece = DataFrame(piece.apply(aggfunc)).T transformed_piece.index = Index([all_key], name=piece.index.name) # append piece for margin into table_piece table_pieces.append(transformed_piece) margin_keys.append(all_key) result = concat(table_pieces, axis=cat_axis) if len(rows) == 0: return result else: result = table margin_keys = table.columns if len(cols) > 0: row_margin = data[cols + values].groupby( cols, observed=observed).agg(aggfunc) row_margin = row_margin.stack() # slight hack new_order = [len(cols)] + list(range(len(cols))) row_margin.index = row_margin.index.reorder_levels(new_order) else: row_margin = Series(np.nan, index=result.columns) return result, margin_keys, row_margin
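# The margin bookkeeping above is what backs pivot_table(margins=True); a short
# end-to-end sketch with hypothetical data and only the public API:
import pandas as pd

df = pd.DataFrame({
    "Branch": ["A", "A", "B", "B"],
    "Buyer": ["Carl", "Joe", "Carl", "Joe"],
    "Quantity": [1, 8, 5, 3],
})
print(pd.pivot_table(df, values="Quantity", index="Branch", columns="Buyer",
                     aggfunc="sum", margins=True, margins_name="All"))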
def return_risk_analysis_old(nav_df: pd.DataFrame, date_frm=None, date_to=None, freq='weekly', rf=0.02):
    """
    Compute return/risk performance statistics column by column.
    :param nav_df: NAV DataFrame; the index is the date, each column is one product's net-value series
    :param date_frm: start of the statistics window, may be None
    :param date_to: end of the statistics window, may be None
    :param freq: None for auto-detection, or 'daily' / 'weekly' / 'monthly'
    :param rf: risk-free rate, default 0.02
    :return:
    """
    nav_df.index = [try_2_date(idx) for idx in nav_df.index]
    nav_sorted_df = nav_df.sort_index()
    rr_df = (1 + nav_sorted_df.pct_change().fillna(0)).cumprod()
    rr_df.index = [try_2_date(d) for d in rr_df.index]
    # Work out whether the actual data frequency is daily, weekly or monthly
    rr_df_len = rr_df.shape[0]
    day_per_data = (rr_df.index[rr_df_len - 1] - rr_df.index[0]).days / rr_df_len
    if day_per_data <= 0.005:
        freq_real = 'minute'
    elif day_per_data <= 0.2:
        freq_real = 'hour'
    elif day_per_data <= 2:
        freq_real = 'daily'
    elif day_per_data <= 10:
        freq_real = 'weekly'
    else:
        freq_real = 'monthly'

    if freq is None:
        freq = freq_real
    elif freq != freq_real:
        warnings_msg = "data freq wrong, expected %s, but %s was detected" % (freq, freq_real)
        # warnings.warn(warnings_msg)
        # logging.warning(warnings_msg)
        raise ValueError(warnings_msg)

    freq_str = ''
    if freq == 'weekly':
        data_count_per_year = 50
        freq_str = '周'
    elif freq == 'monthly':
        data_count_per_year = 12
        freq_str = '月'
    elif freq == 'daily':
        data_count_per_year = 250
        freq_str = '日'
    elif freq == 'hour':
        data_count_per_year = 1250
        freq_str = '时'
    elif freq == 'minute':
        data_count_per_year = 75000
        freq_str = '分'
    else:
        raise ValueError("freq=%s only accepts one of 'daily', 'weekly', 'monthly'" % freq)

    stat_dic_dic = OrderedDict()
    # rr_df.index = [str_2_date(d) for d in rr_df.index]
    rr_uindex_df = rr_df.reset_index()
    col_name_list = list(rr_uindex_df.columns)
    date_col_name = col_name_list[0]
    col_name_list = col_name_list[1:]
    if type(date_frm) is str:
        date_frm = datetime.strptime(date_frm, '%Y-%m-%d').date()
    if type(date_to) is str:
        date_to = datetime.strptime(date_to, '%Y-%m-%d').date()
    for col_name in col_name_list:
        data_df = rr_uindex_df[[date_col_name, col_name]]
        # print(data_df)
        data_df.columns = ['Date', 'Value']
        data_df = get_df_between_date(data_df, date_frm, date_to)
        data_df.Value = data_df.Value / data_df.Value[0]
        data_df['ret'] = data_df.Value.pct_change().fillna(0)
        date_span = data_df.Date[data_df.index[-1]] - data_df.Date[data_df.index[0]]
        date_span_fraction = 365 / date_span.days if date_span.days > 0 else 1

        # basic indicators
        CAGR = data_df.Value[data_df.index[-1]] ** date_span_fraction - 1
        period_rr = data_df.Value[data_df.index[-1]] - 1
        ann_vol = np.std(data_df.ret, ddof=1) * np.sqrt(data_count_per_year)
        down_side_vol = np.std(data_df.ret[data_df.ret < 0], ddof=1) * np.sqrt(data_count_per_year)
        # WeeksNum = data.shape[0]
        profit_loss_ratio = -np.mean(data_df.ret[data_df.ret > 0]) / np.mean(data_df.ret[data_df.ret < 0])
        win_ratio = len(data_df.ret[data_df.ret >= 0]) / len(data_df.ret)
        min_value = min(data_df.Value)
        final_value = data_df.Value[data_df.index[-1]]
        max_ret = max(data_df.ret)
        min_ret = min(data_df.ret)
        # End of basic indicators

        # max drawdown related
        data_df['mdd'] = data_df.Value / data_df.Value.cummax() - 1
        mdd_size = min(data_df.mdd)
        droparray = pd.Series(data_df.index[data_df.mdd == 0])
        if len(droparray) == 1:
            mdd_max_period = len(data_df.mdd)
        else:
            if float(data_df.Value[droparray.tail(1)]) > float(data_df.Value.tail(1)):
                droparray = droparray.append(pd.Series(data_df.index[-1]), ignore_index=True)
            mdd_max_period = max(droparray.diff().dropna()) - 1
        # End of max drawdown related

        # High level indicators
        sharpe_ratio = (CAGR - rf) / ann_vol
        sortino_ratio = (CAGR - rf) / down_side_vol
        calmar_ratio = CAGR / (-mdd_size)

        # Natural-month returns
        j = 1
        for i in data_df.index:
            if i == 0:
                month_ret = pd.DataFrame([[data_df.Date[i], data_df.Value[i]]], columns=('Date', 'Value'))
            else:
                if data_df.Date[i].month != data_df.Date[i - 1].month:
                    month_ret.loc[j] = [data_df.Date[i - 1], data_df.Value[i - 1]]
                    j += 1
        month_ret.loc[j] = [data_df.Date[data_df.index[-1]], data_df.Value[data_df.index[-1]]]
        month_ret['ret'] = month_ret.Value.pct_change().fillna(0)
        max_rr_month = max(month_ret.ret)
        min_rr_month = min(month_ret.ret)
        # End of natural-month returns

        data_len = data_df.shape[0]
        date_begin = data_df.Date[0]  # .date()
        date_end = data_df.Date[data_len - 1]
        stat_dic = OrderedDict([('起始日期', date_begin),
                                ('截止日期', date_end),
                                ('区间收益率', '%.2f%%' % (period_rr * 100)),
                                ('最终净值', '%.4f' % final_value),
                                ('最低净值', '%.4f' % min_value),
                                ('年化收益率', '%.2f%%' % (CAGR * 100)),
                                ('年化波动率', '%.2f%%' % (ann_vol * 100)),
                                ('年化下行波动率', '%.2f%%' % (down_side_vol * 100)),
                                ('最大回撤', '%.2f%%' % (mdd_size * 100)),
                                ('夏普率', '%.2f' % sharpe_ratio),
                                ('索提诺比率', '%.2f' % sortino_ratio),
                                ('卡马比率', '%.2f' % calmar_ratio),
                                ('盈亏比', '%.2f' % profit_loss_ratio),
                                ('胜率', '%.2f' % win_ratio),
                                ('最长不创新高(%s)' % freq_str, mdd_max_period),
                                ('统计周期最大收益', '%.2f%%' % (max_ret * 100)),
                                ('统计周期最大亏损', '%.2f%%' % (min_ret * 100)),
                                ('最大月收益', '%.2f%%' % (max_rr_month * 100)),
                                ('最大月亏损', '%.2f%%' % (min_rr_month * 100))])
        stat_dic_dic[col_name] = stat_dic

    stat_df = pd.DataFrame(stat_dic_dic)
    stat_df = stat_df.loc[list(stat_dic.keys())]  # .ix has been removed from pandas; .loc keeps the row order
    return stat_df
def validate( self, check_obj: pd.DataFrame, head: Optional[int] = None, tail: Optional[int] = None, sample: Optional[int] = None, random_state: Optional[int] = None, lazy: bool = False, ) -> pd.DataFrame: # pylint: disable=too-many-locals,too-many-branches """Check if all columns in a dataframe have a column in the Schema. :param pd.DataFrame dataframe: the dataframe to be validated. :param head: validate the first n rows. Rows overlapping with `tail` or `sample` are de-duplicated. :param tail: validate the last n rows. Rows overlapping with `head` or `sample` are de-duplicated. :param sample: validate a random sample of n rows. Rows overlapping with `head` or `tail` are de-duplicated. :param random_state: random seed for the ``sample`` argument. :param lazy: if True, lazily evaluates dataframe against all validation checks and raises a ``SchemaErrorReport``. Otherwise, raise ``SchemaError`` as soon as one occurs. :returns: validated ``DataFrame`` :raises SchemaError: when ``DataFrame`` violates built-in or custom checks. :example: Calling ``schema.validate`` returns the dataframe. >>> import pandas as pd >>> import pandera as pa >>> >>> df = pd.DataFrame({ ... "probability": [0.1, 0.4, 0.52, 0.23, 0.8, 0.76], ... "category": ["dog", "dog", "cat", "duck", "dog", "dog"] ... }) >>> >>> schema_withchecks = pa.DataFrameSchema({ ... "probability": pa.Column( ... pa.Float, pa.Check(lambda s: (s >= 0) & (s <= 1))), ... ... # check that the "category" column contains a few discrete ... # values, and the majority of the entries are dogs. ... "category": pa.Column( ... pa.String, [ ... pa.Check(lambda s: s.isin(["dog", "cat", "duck"])), ... pa.Check(lambda s: (s == "dog").mean() > 0.5), ... ]), ... }) >>> >>> schema_withchecks.validate(df)[["probability", "category"]] probability category 0 0.10 dog 1 0.40 dog 2 0.52 cat 3 0.23 duck 4 0.80 dog 5 0.76 dog """ if self._is_inferred: warnings.warn( "This %s is an inferred schema that hasn't been " "modified. It's recommended that you refine the schema " "by calling `add_columns`, `remove_columns`, or " "`update_columns` before using it to validate data." 
% type(self), UserWarning ) error_handler = SchemaErrorHandler(lazy) # dataframe strictness check makes sure all columns in the dataframe # are specified in the dataframe schema if self.strict: # expand regex columns col_regex_matches = [] # type: ignore for colname, col_schema in self.columns.items(): if col_schema.regex: try: col_regex_matches.extend( col_schema.get_regex_columns(check_obj.columns)) except errors.SchemaError: pass expanded_column_names = frozenset( [n for n, c in self.columns.items() if not c.regex] + col_regex_matches ) for column in check_obj: if column not in expanded_column_names: msg = ( "column '%s' not in DataFrameSchema %s" % (column, self.columns) ) error_handler.collect_error( "column_not_in_schema", errors.SchemaError( self, check_obj, msg, failure_cases=scalar_failure_case(column), check="column_in_schema", ) ) # column data-type coercion logic lazy_exclude_columns = [] for colname, col_schema in self.columns.items(): if col_schema.regex: try: matched_columns = col_schema.get_regex_columns( check_obj.columns) except errors.SchemaError: matched_columns = pd.Index([]) for matched_colname in matched_columns: if col_schema.coerce or self.coerce: check_obj[matched_colname] = col_schema.coerce_dtype( check_obj[matched_colname]) elif colname not in check_obj and col_schema.required: if lazy: # exclude columns that are not present in the dataframe # for lazy validation, the error is collected by the # error_handler and should raise a SchemaErrors exception # at the end of the `validate` method. lazy_exclude_columns.append(colname) msg = ( "column '%s' not in dataframe\n%s" % (colname, check_obj.head()) ) error_handler.collect_error( "column_not_in_dataframe", errors.SchemaError( self, check_obj, msg, failure_cases=scalar_failure_case(colname), check="column_in_dataframe", ) ) elif col_schema.coerce or self.coerce: check_obj.loc[:, colname] = col_schema.coerce_dtype( check_obj[colname]) schema_components = [ col for col_name, col in self.columns.items() if (col.required or col_name in check_obj) and col_name not in lazy_exclude_columns ] if self.index is not None: if self.index.coerce or self.coerce: check_obj.index = self.index.coerce_dtype(check_obj.index) schema_components.append(self.index) dataframe_to_validate = self._dataframe_to_validate( check_obj, head, tail, sample, random_state) check_results = [] # schema-component-level checks for schema_component in schema_components: try: check_results.append(isinstance( schema_component(dataframe_to_validate), pd.DataFrame)) except errors.SchemaError as err: error_handler.collect_error("schema_component_check", err) # dataframe-level checks for check_index, check in enumerate(self.checks): try: check_results.append(_handle_check_results( self, check_index, check, dataframe_to_validate)) except errors.SchemaError as err: error_handler.collect_error("dataframe_check", err) if lazy and error_handler.collected_errors: raise errors.SchemaErrors( error_handler.collected_errors, check_obj) assert all(check_results) return check_obj
def test_margin_dropna(self): # GH 12577 # pivot_table counts null into margin ('All') # when margins=true and dropna=true df = DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]}) actual = crosstab(df.a, df.b, margins=True, dropna=True) expected = DataFrame([[1, 0, 1], [1, 3, 4], [2, 3, 5]]) expected.index = Index([1.0, 2.0, "All"], name="a") expected.columns = Index([3, 4, "All"], name="b") tm.assert_frame_equal(actual, expected) df = DataFrame({ "a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4] }) actual = crosstab(df.a, df.b, margins=True, dropna=True) expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) expected.index = Index([1.0, 2.0, "All"], name="a") expected.columns = Index([3.0, 4.0, "All"], name="b") tm.assert_frame_equal(actual, expected) df = DataFrame({ "a": [1, np.nan, np.nan, np.nan, np.nan, 2], "b": [3, 3, 4, 4, 4, 4] }) actual = crosstab(df.a, df.b, margins=True, dropna=True) expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) expected.index = Index([1.0, 2.0, "All"], name="a") expected.columns = Index([3, 4, "All"], name="b") tm.assert_frame_equal(actual, expected) # GH 12642 # _add_margins raises KeyError: Level None not found # when margins=True and dropna=False df = DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]}) actual = crosstab(df.a, df.b, margins=True, dropna=False) expected = DataFrame([[1, 0, 1], [1, 3, 4], [2, 4, 6]]) expected.index = Index([1.0, 2.0, "All"], name="a") expected.columns = Index([3, 4, "All"], name="b") tm.assert_frame_equal(actual, expected) df = DataFrame({ "a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4] }) actual = crosstab(df.a, df.b, margins=True, dropna=False) expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 4, 6]]) expected.index = Index([1.0, 2.0, "All"], name="a") expected.columns = Index([3.0, 4.0, "All"], name="b") tm.assert_frame_equal(actual, expected) a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object) b = np.array(["one", "one", "two", "one", "two", np.nan, "two"], dtype=object) c = np.array( ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object) actual = crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"], margins=True, dropna=False) m = MultiIndex.from_arrays( [ ["one", "one", "two", "two", "All"], ["dull", "shiny", "dull", "shiny", ""], ], names=["b", "c"], ) expected = DataFrame( [[1, 0, 1, 0, 2], [2, 0, 1, 1, 5], [3, 0, 2, 1, 7]], columns=m) expected.index = Index(["bar", "foo", "All"], name="a") tm.assert_frame_equal(actual, expected) actual = crosstab([a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=False) m = MultiIndex.from_arrays( [["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]], names=["a", "b"], ) expected = DataFrame( [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 2, 7]], index=m) expected.columns = Index(["dull", "shiny", "All"], name="c") tm.assert_frame_equal(actual, expected) actual = crosstab([a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=True) m = MultiIndex.from_arrays( [["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]], names=["a", "b"], ) expected = DataFrame( [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 1, 6]], index=m) expected.columns = Index(["dull", "shiny", "All"], name="c") tm.assert_frame_equal(actual, expected)
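# A compact way to see the dropna/margins interaction tested above: compare how the
# 'All' margins change when rows with a null key are dropped versus kept.
# Hypothetical data, public pandas API only.
import numpy as np
import pandas as pd

a = pd.Series([1, 2, 2, np.nan])
b = pd.Series([3, 3, 4, 4])
print(pd.crosstab(a, b, margins=True, dropna=True))
print(pd.crosstab(a, b, margins=True, dropna=False))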