def _show(rows, n, cols):
    """Printing to a screen or saving to a file

    rows: iterator of Row instances
    n: maximum number of lines to show
    cols: columns to show
    """
    # so that you can easily maintain code
    # Searching nrows is easier than searching n in editors
    nrows = n

    if cols:
        rows = _pick(cols, rows)

    row0, rows1 = peek_first(rows)
    cols = row0.columns
    seq_values = _safe_values(rows1, cols)

    with pd.option_context("display.max_rows", nrows), \
            pd.option_context("display.max_columns", 1000):
        # make use of pandas DataFrame displaying
        # islice 1 more rows than required
        # to see if there are more rows left
        list_values = list(islice(seq_values, nrows + 1))
        print(pd.DataFrame(list_values[:nrows], columns=cols))
        if len(list_values) > nrows:
            print("...more rows...")
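# A minimal, self-contained sketch of the same "preview n rows, then signal overflow"
# pattern used by _show above, written against plain pandas/itertools only. The
# Row/_pick/peek_first/_safe_values helpers belong to the original module and are not
# reproduced here; `preview` and its arguments are illustrative names, not part of it.
import pandas as pd
from itertools import islice

def preview(records, n=10):
    head = list(islice(iter(records), n + 1))  # take one extra record to detect overflow
    with pd.option_context("display.max_rows", n, "display.max_columns", 1000):
        print(pd.DataFrame(head[:n]))
    if len(head) > n:
        print("...more rows...")

# preview(({"a": i, "b": i * i} for i in range(100)), n=5)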
def test_info_max_cols(self):
    df = DataFrame(np.random.randn(10, 5))
    for len_, verbose in [(5, None), (5, False), (10, True)]:
        # verbose=None: the option decides; False: summarize; True: full output
        with option_context('max_info_columns', 4):
            buf = StringIO()
            df.info(buf=buf, verbose=verbose)
            res = buf.getvalue()
            self.assertEqual(len(res.strip().split('\n')), len_)

    for len_, verbose in [(10, None), (5, False), (10, True)]:
        # max_cols not exceeded
        with option_context('max_info_columns', 5):
            buf = StringIO()
            df.info(buf=buf, verbose=verbose)
            res = buf.getvalue()
            self.assertEqual(len(res.strip().split('\n')), len_)

    for len_, max_cols in [(10, 5), (5, 4)]:
        # setting truncates
        with option_context('max_info_columns', 4):
            buf = StringIO()
            df.info(buf=buf, max_cols=max_cols)
            res = buf.getvalue()
            self.assertEqual(len(res.strip().split('\n')), len_)

        # setting wouldn't truncate
        with option_context('max_info_columns', 5):
            buf = StringIO()
            df.info(buf=buf, max_cols=max_cols)
            res = buf.getvalue()
            self.assertEqual(len(res.strip().split('\n')), len_)
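# Hedged sketch of what the test above exercises: when a frame has more columns than
# display.max_info_columns, DataFrame.info() falls back to the short column-count
# summary unless verbose=True forces the per-column listing (details may vary a bit
# across pandas versions).
import numpy as np
import pandas as pd

frame = pd.DataFrame(np.random.randn(10, 5))
with pd.option_context("display.max_info_columns", 4):
    frame.info()              # summarized: too many columns for the option
    frame.info(verbose=True)  # full per-column listing despite the option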
def test_publishes(self):
    df = pd.DataFrame({"A": [1, 2]})
    objects = [df['A'], df, df]  # dataframe / series
    expected_keys = [
        {'text/plain', 'application/vnd.dataresource+json'},
        {'text/plain', 'text/html', 'application/vnd.dataresource+json'},
    ]

    make_patch = self.mock.patch('IPython.display.display')
    opt = pd.option_context('display.html.table_schema', True)
    for obj, expected in zip(objects, expected_keys):
        with opt, make_patch as mock_display:
            handle = obj._ipython_display_()
            self.assertEqual(mock_display.call_count, 1)
            self.assertIsNone(handle)
            args, kwargs = mock_display.call_args
            arg, = args  # just one argument
            self.assertEqual(kwargs, {"raw": True})
            self.assertEqual(set(arg.keys()), expected)

    with_latex = pd.option_context('display.latex.repr', True)

    with opt, with_latex, make_patch as mock_display:
        handle = obj._ipython_display_()
        args, kwargs = mock_display.call_args
        arg, = args

        expected = {'text/plain', 'text/html', 'text/latex',
                    'application/vnd.dataresource+json'}
        self.assertEqual(set(arg.keys()), expected)
def test_repr_dimensions(self):
    df = DataFrame([[1, 2, ], [3, 4]])
    with option_context('display.show_dimensions', True):
        self.assertTrue("2 rows x 2 columns" in repr(df))

    with option_context('display.show_dimensions', False):
        self.assertFalse("2 rows x 2 columns" in repr(df))

    with option_context('display.show_dimensions', 'truncate'):
        self.assertFalse("2 rows x 2 columns" in repr(df))
def ent_ri_upsell_opp(df, include_rds=False):
    # Variables to store tables and images
    tables = []
    images = []

    # Get today's date and the first day of the previous month
    # (roll the year back when the current month is January)
    today = datetime.today().date()
    if today.month == 1:
        last_month = date(today.year - 1, 12, 1)
    else:
        last_month = date(today.year, today.month - 1, 1)

    # Group by AR Period
    df = df.groupby('AR Period')
    df = df.get_group(last_month)

    # Group by Territory
    by_ter = df.groupby(['Territory'])

    # Empty DataFrames to store the overviews
    ec2_overview = pd.DataFrame(columns=('Territory', 'Total Revenue', 'Total EC2',
                                         'OnDemand', 'Av. % Optimised',
                                         'Partial RI Opp.', 'Full RI Opp.'))
    if include_rds:
        rds_overview = pd.DataFrame(columns=('Territory', 'Total Revenue', 'Total RDS',
                                             'OnDemand', 'Av. % Optimised',
                                             'Partial RI Opp.', 'Full RI Opp.'))

    # For each territory
    for territory, data in by_ter:
        tbl, img, over = ent_ri_ec2_upsell(data, territory)
        tables += tbl
        images += img
        ec2_overview = ec2_overview.append(over)
        if include_rds:
            tbl, img, over = ent_ri_rds_upsell(data, territory)
            tables += tbl
            images += img
            rds_overview = rds_overview.append(over)

    # Include EC2 overview table
    ec2_overview = ec2_overview[['Territory', 'Total Revenue', 'Total EC2', 'OnDemand',
                                 'Av. % Optimised', 'Partial RI Opp.',
                                 'Full RI Opp.']].sort_values('Partial RI Opp.',
                                                              ascending=False)
    ec2_overview.rename(columns={'Av. % Optimised': r'Av. \% Optimised'}, inplace=True)
    pretty_overview = ec2_overview.apply(add_commas_df)
    pretty_overview['Territory'] = pretty_overview['Territory'].apply(add_href)
    with pd.option_context("max_colwidth", 1000):
        pretty_overview.to_latex('ent-ec2-ri-upsell-overview.tex', index=False,
                                 na_rep="Unknown", escape=False)
    table = {'name': "\\textbf{Ent EC2 RI Upsell Opportunity Overview (%s)}"
                     % last_month.strftime("%b-%y"),
             'file': 'ent-ec2-ri-upsell-overview', 'section': 'Overview'}
    tables.append(table)

    # Include RDS overview table
    if include_rds:
        rds_overview = rds_overview[['Territory', 'Total Revenue', 'Total RDS', 'OnDemand',
                                     'Av. % Optimised', 'Partial RI Opp.',
                                     'Full RI Opp.']].sort_values('Partial RI Opp.',
                                                                  ascending=False)
        rds_overview.rename(columns={'Av. % Optimised': r'Av. \% Optimised'}, inplace=True)
        pretty_overview = rds_overview.apply(add_commas_df)
        pretty_overview['Territory'] = pretty_overview['Territory'].apply(add_href)
        with pd.option_context("max_colwidth", 1000):
            pretty_overview.to_latex('ent-rds-ri-upsell-overview.tex', index=False,
                                     na_rep="Unknown", escape=False)
        table = {'name': "\\textbf{Ent RDS RI Upsell Opportunity Overview (%s)}"
                         % last_month.strftime("%b-%y"),
                 'file': 'ent-rds-ri-upsell-overview', 'section': 'Overview'}
        tables.append(table)

    return tables, images
def correlate_operators(operator_so, operator_si, verbose):
    # inner merge to get linear combinations of contributing correlation functions
    lattice_operators = pd.merge(operator_so, operator_si,
                                 how='inner', left_index=True, right_index=True,
                                 suffixes=['_{so}', '_{si}'])

    lattice_operators['coefficient'] = \
        lattice_operators['coefficient_{so}'].apply(np.conj) * \
        lattice_operators['coefficient_{si}']
    lattice_operators.drop(['coefficient_{so}', 'coefficient_{si}'],
                           axis=1, inplace=True)

    lattice_operators.reset_index(inplace=True)
    index = lattice_operators.columns.difference(['coefficient']).tolist()
    order = {r'Irrep': 0, r'mult': 1, r'p_{cm}': 2,
             r'operator_label_{so}': 3, r'operator_label_{si}': 4,
             r'\mu': 5, r'\beta': 6,
             r'q_{so}': 7, r'q_{si}': 8,
             r'p^{0}_{so}': 9, r'p^{1}_{so}': 10,
             r'p^{0}_{si}': 11, r'p^{1}_{si}': 12,
             r'\gamma^{0}_{so}': 13, r'\gamma^{1}_{so}': 14,
             r'\gamma^{0}_{si}': 15, r'\gamma^{1}_{si}': 16}
    index = sorted(index, key=lambda x: order[x])
    lattice_operators.set_index(index, inplace=True)

    lattice_operators = lattice_operators.sum(axis=0, level=index)

    lattice_operators = lattice_operators[lattice_operators['coefficient'] != 0]

    if verbose >= 1:
        print('lattice_operators')
    if verbose == 1:
        with pd.option_context('display.max_rows', None,
                               'display.max_columns', None):
            print(lattice_operators.head())
    if verbose >= 2:
        with pd.option_context('display.max_rows', None,
                               'display.max_columns', None):
            print(lattice_operators)

    return lattice_operators
def test_unicode_print(self):
    c = Categorical(['aaaaa', 'bb', 'cccc'] * 20)
    expected = """\
[aaaaa, bb, cccc, aaaaa, bb, ..., bb, cccc, aaaaa, bb, cccc]
Length: 60
Categories (3, object): [aaaaa, bb, cccc]"""

    assert repr(c) == expected

    c = Categorical(['ああああ', 'いいいいい', 'ううううううう'] * 20)
    expected = """\
[ああああ, いいいいい, ううううううう, ああああ, いいいいい, ..., いいいいい, ううううううう, ああああ, いいいいい, ううううううう]
Length: 60
Categories (3, object): [ああああ, いいいいい, ううううううう]"""  # noqa

    assert repr(c) == expected

    # the unicode option should not affect Categorical, as it doesn't care
    # about the repr width
    with option_context('display.unicode.east_asian_width', True):
        c = Categorical(['ああああ', 'いいいいい', 'ううううううう'] * 20)
        expected = """[ああああ, いいいいい, ううううううう, ああああ, いいいいい, ..., いいいいい, ううううううう, ああああ, いいいいい, ううううううう]
Length: 60
Categories (3, object): [ああああ, いいいいい, ううううううう]"""  # noqa

        assert repr(c) == expected
def main(args):
    df = pd.DataFrame(index=args.directories,
                      columns=["sentences", "tokens", "nodes", "discontinuous",
                               "reentrant", "implicit", "edges", "primary", "remote"])
    df.fillna(0, inplace=True)
    for i, directory in enumerate(args.directories):
        row = df.loc[directory]
        for passage in get_passages_with_progress_bar(directory, desc=directory):
            l1 = passage.layer(layer1.LAYER_ID)
            non_terminals = [n for n in l1.all
                             if n not in l1.heads and len(n.get_terminals()) > 1]
            edges = {e for n in non_terminals for e in n}
            remote_counter = Counter(e.attrib.get("remote", False) for e in edges)
            row["sentences"] += 1
            row["tokens"] += len(passage.layer(layer0.LAYER_ID).all)
            row["nodes"] += len(non_terminals)
            row["discontinuous"] += sum(1 for n in non_terminals if n.discontiguous)
            row["reentrant"] += sum(1 for n in non_terminals
                                    if any(e.attrib.get("remote") for e in n.incoming))
            row["edges"] += len(edges)
            row["primary"] += remote_counter[False]
            row["remote"] += remote_counter[True]
            row["implicit"] += sum(1 for n in l1.all if n.attrib.get("implicit"))

    # Change to percentages
    df["discontinuous"] *= 100. / df["nodes"]
    df["reentrant"] *= 100. / df["nodes"]
    df["implicit"] *= 100. / df["nodes"]
    df["primary"] *= 100. / df["edges"]
    df["remote"] *= 100. / df["edges"]

    # Print
    if args.outfile:
        df.T.to_csv(args.outfile, float_format="%.2f", sep="&",
                    line_terminator=" \\\\\n")
        print("Saved to " + args.outfile)
    else:
        with pd.option_context("display.max_rows", None, "display.max_columns", None):
            print(df.T)
def earncost():
    if request.method == 'GET':
        earntarget = 60000
        tuitiontarget = 10000
        nn = 5
    else:
        earntarget = int(request.form['earnings'])
        tuitiontarget = int(request.form['tuition'])
        nn = int(request.form['viewsize'])

    # run function to get tables
    cols = df.columns
    dfs = {}
    for percent, col in zip(['50%', '25%', '10%'], cols[3:]):
        outcols = list(cols[:3])
        outcols.append(col)  # inst, state, tuition, earnings
        dfres = nncalc(outcols, earntarget, tuitiontarget, nn)
        dfres.columns = ['Institution', 'State', 'Annual Tuition', 'Reported Earnings']
        dfres.sort_values('Annual Tuition', inplace=True, ascending=True)
        with pd.option_context('max_colwidth', -1):
            testhtml = dfres.to_html(index=False, escape=False,
                                     classes='table table-condensed table-striped table-bordered')
        testhtml = testhtml.replace('border="1" ', '').replace('class="dataframe ', 'class="')
        testhtml = testhtml.replace(' style="text-align: right;"', '').replace('&amp;', '&')
        dfs[percent] = testhtml

    # modification date
    updated = moddate()

    return render_template('earncost.html', updated=updated, dfs=dfs,
                           earnings=earntarget, tuition=tuitiontarget, viewsize=nn)
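# Hedged sketch (synthetic data) of the to_html pattern above: raising
# display.max_colwidth stops long cell text, such as institution names or links,
# from being truncated with "..." in the rendered table. Recent pandas prefers
# None over -1 to mean "no limit".
import pandas as pd

cells = pd.DataFrame({"Institution": ["A deliberately long institution name " * 3],
                      "Annual Tuition": [10000]})
with pd.option_context("display.max_colwidth", None):
    html = cells.to_html(index=False, escape=False)
# "..." does not appear anywhere in `html`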
def test_repr_max_seq_item_setting(self):
    # GH10182
    idx = self.create_index()
    idx = idx.repeat(50)
    with pd.option_context("display.max_seq_items", None):
        repr(idx)
        self.assertFalse('...' in str(idx))
def test_ignore_display_max_colwidth(method, expected, max_colwidth):
    # see gh-17004
    df = DataFrame([lorem_ipsum])
    with pd.option_context('display.max_colwidth', max_colwidth):
        result = getattr(df, method)()
    expected = expected(max_colwidth)
    assert expected in result
def _split_symbol_mappings(df, exchanges):
    """Split out the symbol: sid mappings from the raw data.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe with multiple rows for each symbol: sid pair.
    exchanges : pd.DataFrame
        The exchanges table.

    Returns
    -------
    asset_info : pd.DataFrame
        The asset info with one row per asset.
    symbol_mappings : pd.DataFrame
        The dataframe of just symbol: sid mappings. The index will be
        the sid, then there will be three columns: symbol, start_date, and
        end_date.
    """
    mappings = df[list(mapping_columns)]
    with pd.option_context('mode.chained_assignment', None):
        mappings['sid'] = mappings.index
    mappings.reset_index(drop=True, inplace=True)

    # take the most recent sid->exchange mapping based on end date
    asset_exchange = df[
        ['exchange', 'end_date']
    ].sort_values('end_date').groupby(level=0)['exchange'].nth(-1)

    _check_symbol_mappings(mappings, exchanges, asset_exchange)
    return (
        df.groupby(level=0).apply(_check_asset_group),
        mappings,
    )
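# Hedged illustration of why the column assignment above is wrapped in
# option_context('mode.chained_assignment', None): writing into a frame that may be
# a slice of another frame can emit SettingWithCopyWarning, and the option silences
# it for the duration of the block. Under pandas' newer copy-on-write mode the
# warning machinery differs, so treat this as version-dependent.
import pandas as pd

df = pd.DataFrame({"sid": [1, 2, 3], "symbol": ["A", "B", "C"]})
subset = df[df["sid"] > 1]
with pd.option_context("mode.chained_assignment", None):
    subset["flag"] = True  # would normally warn about setting a value on a copy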
def test_register_writer(self): # some awkward mocking to test out dispatch and such actually works called_save = [] called_write_cells = [] class DummyClass(ExcelWriter): called_save = False called_write_cells = False supported_extensions = ['xlsx', 'xls'] engine = 'dummy' def save(self): called_save.append(True) def write_cells(self, *args, **kwargs): called_write_cells.append(True) def check_called(func): func() assert len(called_save) >= 1 assert len(called_write_cells) >= 1 del called_save[:] del called_write_cells[:] with pd.option_context('io.excel.xlsx.writer', 'dummy'): register_writer(DummyClass) writer = ExcelWriter('something.xlsx') assert isinstance(writer, DummyClass) df = tm.makeCustomDataframe(1, 1) check_called(lambda: df.to_excel('something.xlsx')) check_called( lambda: df.to_excel( 'something.xls', engine='dummy'))
def get_data(stream, parameters, fmt):
    """Retrieve data for given stream and parameters, or None if not found"""
    sds = kp.db.StreamDS()
    if stream not in sds.streams:
        log.error("Stream '{}' not found in the database.".format(stream))
        return
    params = {}
    if parameters:
        for parameter in parameters:
            if '=' not in parameter:
                log.error(
                    "Invalid parameter syntax '{}'\n"
                    "The correct syntax is 'parameter=value'".format(parameter)
                )
                continue
            key, value = parameter.split('=')
            params[key] = value
    data = sds.get(stream, fmt, **params)
    if data is not None:
        with pd.option_context('display.max_rows', None,
                               'display.max_columns', None):
            print(data)
    else:
        sds.help(stream)
def test_sum_overflow(self, use_bottleneck): with pd.option_context('use_bottleneck', use_bottleneck): # GH#6915 # overflowing on the smaller int dtypes for dtype in ['int32', 'int64']: v = np.arange(5000000, dtype=dtype) s = Series(v) result = s.sum(skipna=False) assert int(result) == v.sum(dtype='int64') result = s.min(skipna=False) assert int(result) == 0 result = s.max(skipna=False) assert int(result) == v[-1] for dtype in ['float32', 'float64']: v = np.arange(5000000, dtype=dtype) s = Series(v) result = s.sum(skipna=False) assert result == v.sum(dtype=dtype) result = s.min(skipna=False) assert np.allclose(float(result), 0.0) result = s.max(skipna=False) assert np.allclose(float(result), v[-1])
def main():
    game_details = Readme()  # read in game details for all games by conference
    ListofTeams = TeamList(game_details)  # get list of teams for which there is PBP data
    print("Got Game Details")

    with pd.option_context('display.max_rows', 500, 'display.max_columns', 2):
        print(ListofTeams)
    ListofTeams.to_csv("ListofTeams.csv", sep=',')

    # takes a number on the list of teams
    Num_team_choice = int(input("Please select the number corresponding to the team you want: "))
    # Num_team_choice = 125

    # name of team chosen
    Team_Choice = ListofTeams.iloc[Num_team_choice][0]

    # get schedule for team of choice; returns a dictionary with keys "fullsched" and "game_days"
    TeamSched = schedule(Team_Choice, game_details)
    print("Got Schedule for %s" % Team_Choice)

    merged_data = pbp_stats(TeamSched["fullsched"], TeamSched["game_days"],
                            game_details, Team_Choice)
    print("Merged Data")
    # print(merged_data.head(10))

    filename = "../Processed-PBP/%s.csv" % Team_Choice
    # filename = "../Processed-PBP/test.csv"
    merged_data.to_csv(filename, sep='\t')
    print("Done!")
def test_representation_to_series(self): idx1 = TimedeltaIndex([], freq='D') idx2 = TimedeltaIndex(['1 days'], freq='D') idx3 = TimedeltaIndex(['1 days', '2 days'], freq='D') idx4 = TimedeltaIndex(['1 days', '2 days', '3 days'], freq='D') idx5 = TimedeltaIndex(['1 days 00:00:01', '2 days', '3 days']) exp1 = """Series([], dtype: timedelta64[ns])""" exp2 = ("0 1 days\n" "dtype: timedelta64[ns]") exp3 = ("0 1 days\n" "1 2 days\n" "dtype: timedelta64[ns]") exp4 = ("0 1 days\n" "1 2 days\n" "2 3 days\n" "dtype: timedelta64[ns]") exp5 = ("0 1 days 00:00:01\n" "1 2 days 00:00:00\n" "2 3 days 00:00:00\n" "dtype: timedelta64[ns]") with pd.option_context('display.width', 300): for idx, expected in zip([idx1, idx2, idx3, idx4, idx5], [exp1, exp2, exp3, exp4, exp5]): result = repr(pd.Series(idx)) assert result == expected
def test_representation(self, method): idx1 = TimedeltaIndex([], freq='D') idx2 = TimedeltaIndex(['1 days'], freq='D') idx3 = TimedeltaIndex(['1 days', '2 days'], freq='D') idx4 = TimedeltaIndex(['1 days', '2 days', '3 days'], freq='D') idx5 = TimedeltaIndex(['1 days 00:00:01', '2 days', '3 days']) exp1 = """TimedeltaIndex([], dtype='timedelta64[ns]', freq='D')""" exp2 = ("TimedeltaIndex(['1 days'], dtype='timedelta64[ns]', " "freq='D')") exp3 = ("TimedeltaIndex(['1 days', '2 days'], " "dtype='timedelta64[ns]', freq='D')") exp4 = ("TimedeltaIndex(['1 days', '2 days', '3 days'], " "dtype='timedelta64[ns]', freq='D')") exp5 = ("TimedeltaIndex(['1 days 00:00:01', '2 days 00:00:00', " "'3 days 00:00:00'], dtype='timedelta64[ns]', freq=None)") with pd.option_context('display.width', 300): for idx, expected in zip([idx1, idx2, idx3, idx4, idx5], [exp1, exp2, exp3, exp4, exp5]): result = getattr(idx, method)() assert result == expected
def test_representation(self): idx1 = TimedeltaIndex([], freq='D') idx2 = TimedeltaIndex(['1 days'], freq='D') idx3 = TimedeltaIndex(['1 days', '2 days'], freq='D') idx4 = TimedeltaIndex(['1 days', '2 days', '3 days'], freq='D') idx5 = TimedeltaIndex(['1 days 00:00:01', '2 days', '3 days']) exp1 = """TimedeltaIndex([], dtype='timedelta64[ns]', freq='D')""" exp2 = ("TimedeltaIndex(['1 days'], dtype='timedelta64[ns]', " "freq='D')") exp3 = ("TimedeltaIndex(['1 days', '2 days'], " "dtype='timedelta64[ns]', freq='D')") exp4 = ("TimedeltaIndex(['1 days', '2 days', '3 days'], " "dtype='timedelta64[ns]', freq='D')") exp5 = ("TimedeltaIndex(['1 days 00:00:01', '2 days 00:00:00', " "'3 days 00:00:00'], dtype='timedelta64[ns]', freq=None)") with pd.option_context('display.width', 300): for idx, expected in zip([idx1, idx2, idx3, idx4, idx5], [exp1, exp2, exp3, exp4, exp5]): for func in ['__repr__', '__unicode__', '__str__']: result = getattr(idx, func)() self.assertEqual(result, expected)
def test_representation_to_series(self): idx1 = TimedeltaIndex([], freq='D') idx2 = TimedeltaIndex(['1 days'], freq='D') idx3 = TimedeltaIndex(['1 days', '2 days'], freq='D') idx4 = TimedeltaIndex(['1 days', '2 days', '3 days'], freq='D') idx5 = TimedeltaIndex(['1 days 00:00:01', '2 days', '3 days']) exp1 = """Series([], dtype: timedelta64[ns])""" exp2 = """0 1 days dtype: timedelta64[ns]""" exp3 = """0 1 days 1 2 days dtype: timedelta64[ns]""" exp4 = """0 1 days 1 2 days 2 3 days dtype: timedelta64[ns]""" exp5 = """0 1 days 00:00:01 1 2 days 00:00:00 2 3 days 00:00:00 dtype: timedelta64[ns]""" with pd.option_context('display.width', 300): for idx, expected in zip([idx1, idx2, idx3, idx4, idx5], [exp1, exp2, exp3, exp4, exp5]): result = repr(pd.Series(idx)) self.assertEqual(result, expected)
def test_repr_array_long(self, data):
    # some arrays may be able to assert a ... in the repr
    with pd.option_context('display.max_seq_items', 1):
        result = repr(data)
        assert '...' in result
        assert 'length' in result
def call(self): """Print to the io object.""" self.io.write('') self.io.write('At second: {}'.format(self.kwargs.get('seconds'))) self.io.write('Starting moles: {}'.format(self.kwargs.get('moles'))) self.io.write('Activity: {:.2e}'.format(self.scenario.activity())) self.io.write('Watts: {:.2e}'.format(self.scenario.power().watts)) self.io.write('') df = self.scenario.df[[ 'parent', 'daughters', 'parent_fraction', 'q_value_mev', 'starting_moles', 'gamow_factor', 'partial_half_life', 'partial_activity', 'watts', ]] if df.empty: self.io.write('No active isotopes.') else: with pd.option_context('display.max_rows', 999, 'display.max_columns', 10): df = df.dropna().sort_values(['watts', 'gamow_factor'], ascending=[0, 1]) self.io.write(df.to_string() + '\n') self.io.write('')
def draw(self, return_ggplot=False): """ Render the complete plot Parameters ---------- return_ggplot : bool If ``True``, return ggplot object. Returns ------- fig : ~matplotlib.figure.Figure Matplotlib figure plot : ggplot (optional) The ggplot object used for drawn, if ``return_ggplot`` is ``True``. Notes ----- This method does not modify the original ggplot object. You can get the modified ggplot object with :py:`return_ggplot=True`. """ # Pandas deprecated is_copy, and when we create new dataframes # from slices we do not want complaints. We always uses the # new frames knowing that they are separate from the original. with pd.option_context('mode.chained_assignment', None): return self._draw(return_ggplot)
def _check_stat_op(self, name, alternate, string_series_, check_objects=False, check_allna=False): with pd.option_context('use_bottleneck', False): f = getattr(Series, name) # add some NaNs string_series_[5:15] = np.NaN # mean, idxmax, idxmin, min, and max are valid for dates if name not in ['max', 'min', 'mean']: ds = Series(pd.date_range('1/1/2001', periods=10)) with pytest.raises(TypeError): f(ds) # skipna or no assert pd.notna(f(string_series_)) assert pd.isna(f(string_series_, skipna=False)) # check the result is correct nona = string_series_.dropna() tm.assert_almost_equal(f(nona), alternate(nona.values)) tm.assert_almost_equal(f(string_series_), alternate(nona.values)) allna = string_series_ * np.nan if check_allna: assert np.isnan(f(allna)) # dtype=object with None, it works! s = Series([1, 2, 3, None, 5]) f(s) # GH#2888 items = [0] items.extend(lrange(2 ** 40, 2 ** 40 + 1000)) s = Series(items, dtype='int64') tm.assert_almost_equal(float(f(s)), float(alternate(s.values))) # check date range if check_objects: s = Series(pd.bdate_range('1/1/2000', periods=10)) res = f(s) exp = alternate(s) assert res == exp # check on string data if name not in ['sum', 'min', 'max']: with pytest.raises(TypeError): f(Series(list('abc'))) # Invalid axis. with pytest.raises(ValueError): f(string_series_, axis=1) # Unimplemented numeric_only parameter. if 'numeric_only' in compat.signature(f).args: with pytest.raises(NotImplementedError, match=name): f(string_series_, numeric_only=True)
def pca_signal(signal):
    # Data may contain "Inf" or "NaN" values for some ranges, let's just skip
    # such values otherwise PCA will fail
    with pd.option_context('mode.use_inf_as_null', True):
        signal = signal.dropna(how="any", axis=0).T

    pca = PCA(n_components=2)
    x_r = pca.fit_transform(signal)
    return pca, x_r
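# Small worked example of the inf-handling trick in pca_signal: with the option
# enabled, +/-inf values are treated like missing data, so dropna() removes those
# rows as well. The option used above ('mode.use_inf_as_null') was later renamed
# 'mode.use_inf_as_na' and has been deprecated in recent pandas, so the exact
# spelling depends on the installed version.
import numpy as np
import pandas as pd

signal = pd.DataFrame({"s1": [0.1, np.inf, 0.3], "s2": [0.2, 0.5, np.nan]})
with pd.option_context("mode.use_inf_as_na", True):
    cleaned = signal.dropna(how="any", axis=0)
# only the first row survives: the inf row and the NaN row are both dropped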
def get_describe(df):
    desc = df.describe()
    print(desc)

    desc = pd.DataFrame([df.median(), df.mean(), df.std(ddof=0)],
                        index=['median', 'mean', 'std'])
    # print(desc.ix[['mean', 'std']])
    with pd.option_context('display.precision', 4):
        print(desc)
    return desc
def test_print_none_width(self):
    # GH10087
    a = Series(Categorical([1, 2, 3, 4]))
    exp = ("0    1\n1    2\n2    3\n3    4\n"
           "dtype: category\nCategories (4, int64): [1, 2, 3, 4]")

    with option_context("display.width", None):
        assert exp == repr(a)
def test_setitem_chained_no_consolidate(self):
    # https://github.com/pandas-dev/pandas/pull/19268
    # issuecomment-361696418
    # chained setitem used to cause consolidation
    sdf = pd.SparseDataFrame([[np.nan, 1], [2, np.nan]])
    with pd.option_context('mode.chained_assignment', None):
        sdf[0][1] = 2
    assert len(sdf._data.blocks) == 2
def freqTable(grams):
    dataOut = open("dataOut.txt", "w")
    # lines = [line.strip() for line in grams if line.strip() and not line.startswith('com')]
    lineSer = pd.Series(grams)
    freq = lineSer.value_counts()
    freq.to_csv("dataOut.txt")
    with pd.option_context("display.max_rows", 999):
        print(freq)
def test_detect_chained_assignment_warnings(self):
    # warnings
    with option_context('chained_assignment', 'warn'):
        df = DataFrame({'A': ['aaa', 'bbb', 'ccc'], 'B': [1, 2, 3]})

        with tm.assert_produces_warning(
                expected_warning=com.SettingWithCopyWarning):
            df.loc[0]['A'] = 111
print('Preparing pivot tables.')
df_pvt1 = pd.pivot_table(df_inv, index=['_grp', 'YEAR', 'PERIOD'], values=['_adjvol'],
                         columns=[], aggfunc='sum', margins=False,
                         margins_name='Total', fill_value='')
query_text = '_grp != [\'' + conf.NF + '\',\'' + conf.UD + '\']'
df_out_actual = df_pvt1.query(query_text)  # output pivot table - Historical demand
df_out_flat = pd.DataFrame(df_out_actual.to_records())  # *** flattened pivot table for forecasting ***
with pd.option_context('display.max_rows', 100000, 'display.max_columns', 6,
                       'display.expand_frame_repr', False):
    print(df_out_flat[['_grp', 'YEAR', 'PERIOD', '_adjvol']])

# df_pvt2 will be used for reporting out chemical prefixes without chemical groups (df_not_grouped).
df_pvt2 = pd.pivot_table(df_inv, index=['_grp', '_pref'], values=['_adjvol'],
                         columns=[], aggfunc='sum', margins=False,
                         margins_name='Total', fill_value='')
query_text = '_grp == [\'' + conf.NF + '\',\'' + conf.UD + '\']'
df_not_grouped = df_pvt2.query(query_text)  # chemical prefixes remaining to be manually grouped in Excel
def print_pandas_array(self, array):
    import pandas as pd
    if len(array) > 0:
        with pd.option_context('display.max_rows', None,
                               'display.max_columns', None):
            print(array)
def _get_formatted_values(self):
    with option_context('display.max_colwidth', 999999):
        fmt_values = {i: self.fmt._format_col(i) for i in range(self.ncols)}
    return fmt_values
def build(source_index, dest_index, W=10): _dataset = load_dataset(source_index, return_index=True) for _sym, entry in _dataset.items(): _df = pd.read_csv(entry['csv'], sep=',', encoding='utf-8', index_col='Date', parse_dates=True) _target = pd.read_csv(entry['target_csv'], sep=',', encoding='utf-8', index_col='Date', parse_dates=True) ohlcv = _df[entry['features']['ohlcv']] ohlcv_d = { d: _df[entry['features']['ohlcv_{}d'.format(d)]] for d in [3, 7, 30] } ta_d = { d: _df[entry['features']['ta_{}d'.format(d)]] for d in [3, 7, 30] } ta = _df[entry['features']['ta']] cm = _df[entry['features']['cm']] cm_picked = pd.DataFrame(index=ohlcv.index) if 'adractcnt' in cm.columns: cm_picked['adractcnt_pct'] = cm.adractcnt.pct_change() # cm_picked['adractcnt_mean3_pct'] = cm.adractcnt.rolling(3).mean().pct_change() # cm_picked['adractcnt_mean7_pct'] = cm.adractcnt.rolling(7).mean().pct_change() # if 'splycur' in cm.columns: ## Correlated with volume and close # cm_picked['vol_supply'] = ohlcv.volume / cm.splycur # Ratio between transacted volume and total supply (mined) if 'txtfrvaladjntv' in cm.columns and 'isstotntv' in cm.columns and 'feetotntv' in cm.columns: # I want to represent miners earnings (fees + issued coins) vs amount transacted in that interval cm_picked['earned_vs_transacted'] = ( cm.isstotntv + cm.feetotntv) / cm.txtfrvaladjntv if 'isstotntv' in cm.columns: # isstotntv is total number of coins mined in the time interval # splycur is total number of coins mined (all time) total_mined = cm.isstotntv.rolling( 365, min_periods=7).sum() # total mined in a year cm_picked['isstot365_isstot1_pct'] = (cm.isstotntv / total_mined).pct_change() if 'splycur' in cm.columns and 'isstotntv' in cm.columns: cm_picked['splycur_isstot1_pct'] = (cm.isstotntv / cm.splycur).pct_change() if 'hashrate' in cm.columns: #cm_picked['hashrate_mean3_pct'] = cm.hashrate.rolling(3).mean().pct_change() #cm_picked['hashrate_mean7_pct'] = cm.hashrate.rolling(7).mean().pct_change() cm_picked['hashrate_pct'] = cm.hashrate.pct_change() if 'roi30d' in cm.columns: cm_picked['roi30d'] = cm.roi30d if 'isstotntv' in cm.columns: cm_picked['isstotntv_pct'] = cm.isstotntv.pct_change() if 'feetotntv' in cm.columns: cm_picked['feetotntv_pct'] = cm.feetotntv.pct_change() if 'txtfrcount' in cm.columns: cm_picked['txtfrcount_pct'] = cm.txtfrcount.pct_change() #cm_picked['txtfrcount_volume'] = cm.txtfrcount.pct_change() if 'vtydayret30d' in cm.columns: cm_picked['vtydayret30d'] = cm.vtydayret30d if 'isscontpctann' in cm.columns: cm_picked['isscontpctann'] = cm.isscontpctann ta_picked = pd.DataFrame(index=ta.index) # REMA / RSMA are already used and well-estabilished in ATSA, # I'm taking the pct change since i want to encode the relative movement of the ema's not their positions # ta_picked['rema_5_20_pct'] = ta.rema_5_20.pct_change() ta_picked['rema_8_15_pct'] = ta.rema_8_15.pct_change() # ta_picked['rema_20_50_pct'] = ta.rema_20_50.pct_change() # ta_picked['rsma_5_20_pct'] = ta.rema_5_20.pct_change() ta_picked['rsma_8_15_pct'] = ta.rema_8_15.pct_change() # ta_picked['rsma_20_50_pct'] = ta.rema_20_50.pct_change() # Stoch is a momentum indicator comparing a particular closing price of a security to a range of its prices # over a certain period of time. # The sensitivity of the oscillator to market movements is reducible by adjusting that time period or # by taking a moving average of the result. # It is used to generate overbought and oversold trading signals, utilizing a 0-100 bounded range of values. 
# IDEA => decrease sensitivity by 3-mean and divide by 100 to get fp values ta_picked['stoch_14_mean3_div100'] = ta.stoch_14.rolling( 3).mean() / 100 #Moving Average Convergence Divergence (MACD) is a trend-following momentum indicator that shows # the relationship between two moving averages of a security’s price. # The MACD is calculated by subtracting the 26-period Exponential Moving Average (EMA) from the 12-period EMA. # A nine-day EMA of the MACD called the "signal line," is then plotted on top of the MACD line, # which can function as a trigger for buy and sell signals. # Traders may buy the security when the MACD crosses above its signal line and sell - or short - the security # when the MACD crosses below the signal line. # Moving Average Convergence Divergence (MACD) indicators can be interpreted in several ways, # but the more common methods are crossovers, divergences, and rapid rises/falls. signal_line = builder.exponential_moving_average(ta.macd_12_26, 9) ta_picked[ 'macd_12_26_signal'] = signal_line # Relationship with signal line ta_picked['macd_12_26_diff_signal'] = ( ta.macd_12_26 - signal_line).pct_change() # Relationship with signal line ta_picked['macd_12_26_pct'] = ta.macd_12_26.pct_change( ) # Information about slope # PPO is identical to the moving average convergence divergence (MACD) indicator, # except the PPO measures percentage difference between two EMAs, while the MACD measures absolute (dollar) difference. signal_line = builder.exponential_moving_average(ta.ppo_12_26, 9) ta_picked[ 'ppo_12_26_signal'] = signal_line # Relationship with signal line ta_picked['ppo_12_26_diff_signal'] = ( ta.ppo_12_26 - signal_line).pct_change() # Relationship with signal line ta_picked['ppo_12_26_pct'] = ta.ppo_12_26.pct_change( ) # Information about slope # ADI Accumulation/distribution is a cumulative indicator that uses volume and price to assess whether # a stock is being accumulated or distributed. # The accumulation/distribution measure seeks to identify divergences between the stock price and volume flow. # This provides insight into how strong a trend is. If the price is rising but the indicator is falling # this indicates that buying or accumulation volume may not be enough to support # the price rise and a price decline could be forthcoming. # ==> IDEA: if we can fit a line to the price y1 = m1X+q1 and a line to ADI y2=m2X+q2 then we can identify # divergences by simply looking at the sign of M. # Another insight would be given by the slope (ie pct_change) ta_picked['adi_pct'] = ta.adi.pct_change() ta_picked['adi_close_convergence'] = convergence_between_series( ta.adi, ohlcv.close, 3) # RSI goes from 0 to 100, values <= 20 mean BUY, while values >= 80 mean SELL. # Dividing it by 100 to get a floating point feature, makes no sense to pct_change it ta_picked['rsi_14_div100'] = ta.rsi_14 / 100 # The Money Flow Index (MFI) is a technical indicator that generates overbought or oversold # signals using both prices and volume data. The oscillator moves between 0 and 100. # An MFI reading above 80 is considered overbought and an MFI reading below 20 is considered oversold, # although levels of 90 and 10 are also used as thresholds. # A divergence between the indicator and price is noteworthy. For example, if the indicator is rising while # the price is falling or flat, the price could start rising. 
ta_picked['mfi_14_div100'] = ta.mfi_14 / 100 # The Chande momentum oscillator is a technical momentum indicator similar to other momentum indicators # such as Wilder’s Relative Strength Index (Wilder’s RSI) and the Stochastic Oscillator. # It measures momentum on both up and down days and does not smooth results, triggering more frequent # oversold and overbought penetrations. The indicator oscillates between +100 and -100. # Many technical traders add a 10-period moving average to this oscillator to act as a signal line. # The oscillator generates a bullish signal when it crosses above the moving average and a # bearish signal when it drops below the moving average. ta_picked['cmo_14_div100'] = ta.cmo_14 / 100 signal_line = builder.simple_moving_average(ta.cmo_14, 10) ta_picked['cmo_14_signal'] = signal_line ta_picked['cmo_14_diff_signal'] = (ta.cmo_14 - signal_line) / 100 # On-balance volume (OBV) is a technical trading momentum indicator that uses volume flow to predict changes in stock price. # Eventually, volume drives the price upward. At that point, larger investors begin to sell, and smaller investors begin buying. # Despite being plotted on a price chart and measured numerically, # the actual individual quantitative value of OBV is not relevant. # The indicator itself is cumulative, while the time interval remains fixed by a dedicated starting point, # meaning the real number value of OBV arbitrarily depends on the start date. # Instead, traders and analysts look to the nature of OBV movements over time; # the slope of the OBV line carries all of the weight of analysis. => We want percent change ta_picked['obv_pct'] = ta.obv.pct_change() ta_picked['obv_mean3_pct'] = ta.obv.rolling(3).mean().pct_change() # Strong rallies in price should see the force index rise. # During pullbacks and sideways movements, the force index will often fall because the volume # and/or the size of the price moves gets smaller. # => Encoding the percent variation could be a good idea ta_picked['fi_13_pct'] = ta.fi_13.pct_change() ta_picked['fi_50_pct'] = ta.fi_50.pct_change() # The Aroon Oscillator is a trend-following indicator that uses aspects of the # Aroon Indicator (Aroon Up and Aroon Down) to gauge the strength of a current trend # and the likelihood that it will continue. # It moves between -100 and 100. A high oscillator value is an indication of an uptrend # while a low oscillator value is an indication of a downtrend. ta_picked['ao_14'] = ta.ao_14 / 100 # The average true range (ATR) is a technical analysis indicator that measures market volatility # by decomposing the entire range of an asset price for that period. # ATRP is pct_change of volatility ta_picked['atrp_14'] = ta.atrp_14 # Percentage Volume Oscillator (PVO) is momentum volume oscillator used in technical analysis # to evaluate and measure volume surges and to compare trading volume to the average longer-term volume. # PVO does not analyze price and it is based solely on volume. # It compares fast and slow volume moving averages by showing how short-term volume differs from # the average volume over longer-term. # Since it does not care a trend's factor in its calculation (only volume data are used) # this technical indicator cannot be used alone to predict changes in a trend. 
ta_picked['pvo_12_26'] = ta.pvo_12_26 # IGNORED: tsi, wd, adx, #lagged_stats = pd.concat([ohlcv_stats] + [builder.make_lagged(ohlcv_stats, i) for i in range(1,10+1)], axis='columns', verify_integrity=True, sort=True, join='inner') # Build the dataframe with base features # lagged_close = pd.concat([ohlcv.close.pct_change()] + [builder.make_lagged(ohlcv.close.pct_change(), i) for i in range(1,10+1)], axis='columns', verify_integrity=True, sort=True, join='inner') # lagged_close.columns = ['close_pct'] + ['close_pct_lag-{}'.format(i) for i in range(1, W +1)] ohlc = ohlcv[['open', 'high', 'low', 'close', 'volume']].pct_change() ohlc.columns = ['{}_pct'.format(c) for c in ohlcv.columns] lagged_ohlc_pct = pd.concat( [ohlc] + [builder.make_lagged(ohlc, i) for i in range(1, W + 1)], axis='columns', verify_integrity=True, sort=True, join='inner') _time = pd.DataFrame(index=ohlcv.index) _time['day_of_year'] = ohlcv.index.dayofyear _time['day_of_week'] = ohlcv.index.dayofweek ohlc = ohlcv[['open', 'high', 'low', 'close', 'volume']] x_space = np.linspace(0, ohlc.index.size, ohlc.index.size) _splines = pd.DataFrame(index=ohlcv.index) # Highly correlated between themselves, no use # _splines['open_spl'] = get_spline(ohlc.open, 0) # _splines['high_spl'] = get_spline(ohlc.high, 0) # _splines['low_spl'] = get_spline(ohlc.low, 0) # _splines['close_spl'] = get_spline(ohlc.close, 0) _splines['open_spl_d1'] = builder.get_spline(ohlc.open, 1) _splines['high_spl_d1'] = builder.get_spline(ohlc.high, 1) _splines['low_spl_d1'] = builder.get_spline(ohlc.low, 1) _splines['close_spl_d1'] = builder.get_spline(ohlc.close, 1) _splines['open_spl_d2'] = builder.get_spline(ohlc.open, 2) _splines['high_spl_d2'] = builder.get_spline(ohlc.high, 2) _splines['low_spl_d2'] = builder.get_spline(ohlc.low, 2) _splines['close_spl_d2'] = builder.get_spline(ohlc.close, 2) _patterns = builder.get_talib_patterns(ohlcv) _new_features = pd.DataFrame(index=ohlcv.index) _new_features['candlestick_patterns_mean'] = _patterns.mean(axis=1) _new_features['candlestick_patterns_sum'] = _patterns.sum(axis=1) # WE LIKE THESE TWO!!!! 
_new_features['close_volatility_7d'] = ohlcv.close.pct_change( ).rolling(7).std(ddof=0) _new_features['close_volatility_30d'] = ohlcv.close.pct_change( ).rolling(30).std(ddof=0) # # Candle body size variation, for example _new_features['close_open_pct'] = ( ohlcv.close - ohlcv.open ).pct_change() # Change in body of the candle (> 0 if candle is green) _new_features['high_close_dist_pct'] = ( ohlcv.high - ohlcv.close ).pct_change( ) # Change in wick size of the candle, shorter wick should be bullish _new_features['low_close_dist_pct'] = ( ohlcv.close - ohlcv.low ).pct_change( ) # Change in shadow size of the candle, this increasing would indicate support (maybe a bounce) _new_features['high_low_dist_pct'] = ( ohlcv.high - ohlcv.low ).pct_change( ) # Change in total candle size, smaller candles stands for low volatility for d in [3, 7, 30]: ohlcv_d[d].columns = ['close', 'high', 'low', 'open', 'volume'] _new_features['close_open_pct_d{}'.format(d)] = ( ohlcv_d[d].close - ohlcv_d[d].open).pct_change() _new_features['high_close_dist_pct_d{}'.format(d)] = ( ohlcv_d[d].high - ohlcv_d[d].close).pct_change() _new_features['low_close_dist_pct_d{}'.format(d)] = ( ohlcv_d[d].close - ohlcv_d[d].low).pct_change() _new_features['high_low_dist_pct_d{}'.format(d)] = ( ohlcv_d[d].high - ohlcv_d[d].low).pct_change() _ta_windowed_features = pd.concat([ v.rename(columns={c: '{}_ta{}d'.format(c, d) for c in v.columns}) for d, v in ta_d.items() ], axis=1) # Add lagged features to the dataframe ta.columns = ['{}_ta1d'.format(c) for c in ta.columns] feature_groups = [ _new_features, _splines, lagged_ohlc_pct, cm_picked, ta_picked, _ta_windowed_features, ta ] improved_df = pd.concat(feature_groups, axis='columns', verify_integrity=True, sort=True, join='inner') # Drop the first 30 rows improved_df = improved_df[30:] # Drop columns whose values are all nan or inf with pd.option_context('mode.use_inf_as_na', True): # Set option temporarily improved_df = improved_df.dropna(axis='columns', how='all') logger.info('Saving {}'.format(_sym)) save_symbol_dataset(dest_index, _sym, improved_df, target=_target) logger.info('Saved {}'.format(_sym))
def update_results(): """Update the results table after a batch, cell or model selection is changed. """ user = get_current_user() session = Session() nullselection = """ MUST SELECT A BATCH AND ONE OR MORE CELLS AND ONE OR MORE MODELS BEFORE RESULTS WILL UPDATE """ bSelected = request.args.get('bSelected') cSelected = request.args.getlist('cSelected[]') mSelected = request.args.getlist('mSelected[]') colSelected = request.args.getlist('colSelected[]') # If no batch, cell or model is selected, display an error message. if (len(bSelected) == 0) or (not cSelected) or (not mSelected): return jsonify(resultstable=nullselection) # Only get numerals for selected batch. bSelected = bSelected[:3] # Use default value of 500 if no row limit is specified. rowlimit = request.args.get('rowLimit', 500) ordSelected = request.args.get('ordSelected') # Parse string into appropriate sqlalchemy method if ordSelected == 'asc': ordSelected = asc elif ordSelected == 'desc': ordSelected = desc sortSelected = request.args.get('sortSelected', 'cellid') # Always add cellid and modelname to column lists, # since they are required for selection behavior. cols = [ getattr(NarfResults, 'cellid'), getattr(NarfResults, 'modelname'), ] cols += [ getattr(NarfResults, c) for c in colSelected if hasattr(NarfResults, c) ] # Package query results into a DataFrame results = psql.read_sql_query( Query(cols, session).filter(NarfResults.batch == bSelected).filter( NarfResults.cellid.in_(cSelected)).filter( NarfResults.modelname.in_(mSelected)).filter( or_( int(user.sec_lvl) == 9, NarfResults.public == '1', NarfResults.labgroup.ilike('%{0}%'.format( user.labgroup)), NarfResults.username == user.username, )).order_by(ordSelected(getattr( NarfResults, sortSelected))).limit(rowlimit).statement, session.bind) with pd.option_context('display.max_colwidth', -1): resultstable = results.to_html( index=False, classes="table-hover table-condensed", ) session.close() return jsonify(resultstable=resultstable)
def test_config_default_off(self):
    df = pd.DataFrame({"A": [1, 2]})
    with pd.option_context("display.html.table_schema", False):
        result = df._repr_data_resource_()

    assert result is None
def main(): # Build new directories new_dir(args.path) export_path = args.path + "/export_conll" new_dir(export_path) import_path = args.path + "/import_conll" new_dir(import_path) StringProcessor = string_preprocessing.Preprocessor() # Get source text if args.txt: with open(args.txt) as f: raw_text = f.read() all_sentences = StringProcessor.process(raw_text) elif args.copy_beware_columns: all_sentences, all_conlls = conll_and_spacy.ConllSpacyUpdater.load_all_conlls( export_path) else: raise OSError("Either txt or conll must be given to know the text") if args.subdivision: all_sentences, subdivision_structure = StringProcessor.extract_subdivision_structure( args.subdivision, all_sentences) GrammarParser = parse_grammar.GrammarParser() ConllUpdater = conll_and_spacy.ConllSpacyUpdater(export_dir=export_path, import_dir=import_path) # Fit spacy sparses with bad sentence chunking into well chunked nltk sentences and build the new conlls chunk_width = 10 line_chunks, non_over_lapping_intervall = batch_splitter(all_sentences, chunk_width, overlap_margin=5) s_counter = count(0) # Count generator for j = next(s_counter) # Index of sentence in the Corpus conll_df = pd.DataFrame() # Compute the parts of the corpus als blocks of sentence lists, ignore the margins for coref resolution, handle corefs for each block. for i, chintervall in enumerate( list(zip(line_chunks, non_over_lapping_intervall))): corpus_index = chunk_width * i ch, intervall = chintervall chunk_text = " ".join(ch) spacy_neucoref_doc = GrammarParser.process(chunk_text) spacy_position = 0 conll_dict = [] for ch_j, sentence_from_chunk in enumerate(ch): tokens = StringProcessor.tokenize_text_to_words( sentence_from_chunk) start_token = tokens[0] start_pos = 0 try: end_pos, last_token = next( (i, t) for i, t in list(enumerate(tokens))[::-1] if t not in ['.', '?', '!']) except StopIteration: logging.error( "no last token here? '%s' for sentence no %d: '%s'" % (str(tokens), j, str(sentence_from_chunk))) if '/' in last_token: last_token = last_token.split('/')[-1] sent_start = conll_and_spacy.find_position_in_doc_by_approx( spacy_neucoref_doc, start_token, spacy_position + start_pos) sent_end = conll_and_spacy.find_position_in_doc_by_approx( spacy_neucoref_doc, last_token, spacy_position + end_pos) if tokens[-1] in ['.', '?', '!']: dot = 1 else: dot = 0 sentence_from_spacy = spacy_neucoref_doc[sent_start:sent_end + 1 + dot] spacy_position = sent_start + len(tokens) if not ch_j in range(*intervall): continue if args.copy_beware_columns: ConllUpdater.conll_over_spacy(sentence_from_spacy, import_path, j, no_cols=args.copy_beware_columns) conll_dict.extend( ConllUpdater.export_dict(sentence_from_spacy, index=j)) j = next(s_counter) single_chunk_conll_df = pd.DataFrame(conll_dict) ConllUpdater.annotate_corefs(spacy_neucoref_doc, single_chunk_conll_df) # Updates the df with the coref-annotations as whole doc block, because the coref annotations are not complete, asking the tokens. #single_chunk_conll_df = single_chunk_conll_df.iloc[range(*intervall)] conll_df = conll_df.append(single_chunk_conll_df, ignore_index=True) # Groupby df sometimes doesn't contain the column, that it is grouped by. Copy this! 
conll_df['sent_id'] = conll_df['s_id'] # Write all the conll files conll_df.groupby( ['s_id']).apply(lambda x: ConllUpdater.write_conll_by_df_group(x)) with open(export_path + "/lemmas.txt", 'w+') as f: f.write(" ".join(conll_df['lemma'].tolist())) with open(export_path + "/subdivision.txt", 'w+') as f: f.write(str(subdivision_structure)) test_fun = test_equality_of_sentences(all_sentences) test_df = conll_df.groupby(['sent_id']).apply(lambda x: test_fun(x)) with pd.option_context('display.max_rows', None, 'display.max_columns', None): print(test_df) return 0
                                              to_replace=unit, value="Year")
    else:
        continue

# Filling missing values with median
df['pw_amount_9089'] = df['pw_amount_9089'].fillna(
    (df['pw_amount_9089'].median()))

# Changing format from string to float
df['pw_amount_9089'] = df.pw_amount_9089.astype(float)

# Displaying 10 first values
df[['pw_amount_9089', 'pw_unit_of_pay_9089']].head(10)

# In[31]:

# Since running "describe" method on "pw_amount_9089" column returned exponential values,
# I decided to convert them to floats so that they are easier to understand
with pd.option_context('float_format', '{:.2f}'.format):
    print(df.pw_amount_9089.describe())

# In[32]:

# Dividing our continuous income values into some categories to facilitate their visualization
df['remuneration'] = pd.cut(df['pw_amount_9089'],
                            [0, 30000, 60000, 90000, 120000, 150000, 180000,
                             210000, 240000, 270000, 495748000],
                            right=False,
                            labels=["0-30k", "30-60k", "60-90k", "90-120k",
                                    "120-150k", "150-180k", "180-210k",
                                    "210-240k", "240-270k", "270k+"])
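# Tiny illustration (made-up wage values) of how the pd.cut call above buckets the
# column: right=False makes every bin closed on the left and open on the right, so a
# value of exactly 30000 lands in "30-60k" rather than "0-30k".
import pandas as pd

wages = pd.Series([25000, 30000, 100000, 500000])
pd.cut(wages, [0, 30000, 60000, 90000, 120000, 495748000], right=False,
       labels=["0-30k", "30-60k", "60-90k", "90-120k", "120k+"])
# -> 0-30k, 30-60k, 90-120k, 120k+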
def test_repr_max_rows(self):
    # GH 6863
    with pd.option_context('max_rows', None):
        str(Series(range(1001)))  # should not raise exception
def test_repr_with_unicode_data():
    with pd.option_context("display.encoding", "UTF-8"):
        d = {"a": ["\u05d0", 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}
        index = pd.DataFrame(d).set_index(["a", "b"]).index
        assert "\\" not in repr(index)  # we don't want unicode-escaped
def show_matrix(m):
    with pd.option_context('display.float_format', lambda x: "%g" % x):
        display(pd.DataFrame(m))
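# Hedged sketch of the float formatting used by show_matrix: display.float_format
# accepts a callable applied to every float, and "%g" trims trailing zeros. display()
# above assumes an IPython session; plain print() is used here instead.
import pandas as pd

m = [[1.50000, 0.000123], [2.0, 3.14159]]
with pd.option_context("display.float_format", lambda x: "%g" % x):
    print(pd.DataFrame(m))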
    movies_links.append(wiki_prefix + link.get('href'))

# import pandas to convert list to data frame
import pandas as pd

df = pd.DataFrame(A, columns=['Title'])
df['Year'] = B
df['Role'] = C
df['Director'] = D
df['Links'] = movies_links
df.sort_values("Year")

question_1 = df.copy().drop("Links", axis=1)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(question_1)

###### Question 2
# Get all actors from all movies:
# for movie_link in df["Links"]:
#     if movie_link != "":
#         page = urlopen(movie_link)
#         soup = BeautifulSoup(page, features="html.parser")
#         for elem in soup.find_all("h2", text=re.compile(r'[cast|Cast]')):
#             print(elem)
#         # nu = soup.findAll('h2', re.compile("(cast|Cast)"))
#         # print(nu)

actors_list = []
trade_size=Decimal(1_000_000), order_id_tag="001", ) # Instantiate and add your strategy strategy = EMACrossBracket(config=config) engine.add_strategy(strategy=strategy) input("Press Enter to continue...") # noqa (always Python 3) # Run the engine (from start to end of data) engine.run() # Optionally view reports with pd.option_context( "display.max_rows", 100, "display.max_columns", None, "display.width", 300, ): print(engine.trader.generate_account_report(SIM)) print(engine.trader.generate_order_fills_report()) print(engine.trader.generate_positions_report()) # For repeated backtest runs make sure to reset the engine engine.reset() # Good practice to dispose of the object when done engine.dispose()
def test_summary(self, file_name=None): self.assert_all_tested() columns = [{ "title": "name", "key": "name", "type": str }, { "title": "mean costs", "key": "test_mean_costs", "type": float }, { "title": "variance costs", "key": "test_variance_costs", "type": float }, { "title": "mean wealth", "key": "test_mean_wealth", "type": float }, { "title": "variance wealth", "key": "test_variance_wealth", "type": float }, { "title": "mean abs wealth w. price", "key": "test_wealth_with_price_abs_mean", "type": float }, { "title": "variance wealth w. price", "key": "test_wealth_with_price_variance", "type": float }, { "title": "risk (train)", "key": "train_risk", "type": float }, { "title": "risk (test)", "key": "test_risk", "type": float }, { "title": "price", "key": "price", "type": float }, { "title": "training time", "key": "train_time", "type": float }, { "title": "trainable vars", "key": "trainable_variables", "type": int }, { "title": "non-trainable vars", "key": "non_trainable_variables", "type": int }] dictionary = {} for col in columns: dictionary[col["title"]] = [col["type"](case[col["key"]]) \ for case in self.testcases] df = pd.DataFrame(dictionary) appendum = "\n\n" + f"payoff: {self.test_mean_payoff: .6f}" if file_name is not None: df.to_csv(file_name, index=False) with open(file_name, "a") as file: file.write(appendum) with pd.option_context('display.max_columns', None): print(df)
gnb.fit(X_train, categoria_encoded)

######################################### TEST #########################################

X_counts2 = vectorizer.transform(df['Des_limpio']).toarray()
predicted = gnb.predict(X_counts2)

print("\n")
print("######################################### Input data: ######################################### " + "\n")
print(df[['Descripción', 'Cargos (CLP)', 'Abonos (CLP)', 'Saldo (CLP)']])
# print("######################################### Los cargos de entrada son: ######################################### " + "\n")
print("\n")
print("######################################### Resultado del modelo: ######################################### " + "\n")

for numero, x in enumerate(predicted):
    predicciones.append(list(le.classes_)[x])

df = df.assign(Categoria=predicciones)

with pd.option_context('display.max_rows', None):
    print(df[['Descripción', 'Cargos (CLP)', 'Abonos (CLP)',
              'Saldo (CLP)', 'Categoria']])
# """
def evaluate(self, save_as=None): if save_as: log = open(logs_save_path + "{}.log".format(save_as), "w") else: log = open("temp_{}.log".format(time.ctime()), "w") ystd_score = sum([ self.portfolio.scores.get(s, 0) * self.portfolio.weights_ystd.get(s, 0) for s in self.portfolio.stock_pool ]) tdy_score = sum([ self.portfolio.scores.get(s, 0) * self.hold_weights.get(s, 0) for s in self.hold_weights ]) print() print("[Before Adjust] Score: {:.6f} Percentile: {:.4f}".format( ystd_score, self.score_rank(ystd_score))) print("[~After Adjust] Score: {:.6f} Percentile: {:.4f}".format( tdy_score, self.score_rank(tdy_score))) self.ystd_score_rank = self.score_rank(ystd_score) self.tdy_score_rank = self.score_rank(tdy_score) # print( "[Before Adjust] Score: {:.6f} Percentile: {:.4f}".format( ystd_score, self.score_rank(ystd_score)), file=log, ) print( "[~After Adjust] Score: {:.6f} Percentile: {:.4f}".format( tdy_score, self.score_rank(tdy_score)), file=log, ) ysd_holding = { k: v for k, v in self.portfolio.weights_ystd.items() if v > 0 } tdy_holding = { int(v.name.split("_")[1]): v.varValue for v in self.solver.variables() if v.varValue } stock_out = set(ysd_holding) - set(tdy_holding) stock_in = set(tdy_holding) - set(ysd_holding) adjust = set(tdy_holding) & set(ysd_holding) adjust = [ s for s in adjust if abs( self.portfolio.weights_ystd.get(s, 0) - self.hold_weights.get(s, 0)) > 1e-6 ] print("\nIn:{}\t Out:{}\t Adjust:{}\t".format(len(stock_in), len(stock_out), len(adjust))) print( "\nIn:{}\t Out:{}\t Adjust:{}\t".format(len(stock_in), len(stock_out), len(adjust)), file=log, ) buyin_tvr = sum([self.hold_weights.get(s, 0) for s in stock_in]) adj_tvr = sum([ max( self.hold_weights.get(s, 0) - self.portfolio.weights_ystd.get(s, 0), 0, ) for s in adjust ]) print( "\n>> Turnover: \n Buy in {:7.4f} Adjust {:.4f} Total {:.4f}" .format(buyin_tvr, adj_tvr, buyin_tvr + adj_tvr)) print( "\n>> Turnover: \n Buy in {:7.4f} Adjust {:.4f} Total {:.4f}" .format(buyin_tvr, adj_tvr, buyin_tvr + adj_tvr), file=log, ) self.total_tvr = buyin_tvr + adj_tvr tdy_values, ystd_values = {}, {} for var in self.portfolio.continuous: tdy_values[var] = sum([ getattr(self.portfolio, var)[s] * self.hold_weights.get(s, 0) for s in self.hold_weights ]) ystd_values[var] = sum([ getattr(self.portfolio, var)[s] * self.portfolio.weights_ystd.get(s, 0) for s in self.portfolio.stock_pool ]) print( "\n>> {}:\n Before {:7.4f} After {:.4f} Target {:.4f} Δ {:7.4f} " .format( var, ystd_values[var], tdy_values[var], getattr(self.portfolio, "{}_constraint".format(var)), tdy_values[var] - getattr(self.portfolio, "{}_constraint".format(var)), )) print( "\n>> {}:\n Before {:7.4f} After {:.4f} Target {:.4f} Δ {:7.4f} " .format( var, ystd_values[var], tdy_values[var], getattr(self.portfolio, "{}_constraint".format(var)), tdy_values[var] - getattr(self.portfolio, "{}_constraint".format(var)), ), file=log, ) df_info = {} for var in self.portfolio.descrete: df_info[var] = pd.DataFrame( {"before": getattr(self.portfolio, "{}_weights".format(var))}) after = { sc: sum([ w * self.is_this_category(s, sc, var) for s, w in self.hold_weights.items() ]) for sc in getattr(self.portfolio, "{}_list".format(var)) } df_info[var]["after"] = [after.get(x) for x in df_info[var].index] df_info[var]["target"] = [ getattr(self.portfolio, "{}_constraint".format(var)).get(x) for x in df_info[var].index ] df_info[var]["Δ"] = df_info[var]["after"] - df_info[var]["target"] with pd.option_context("display.max_rows", 8): print( "\n>> {}:\n\n".format(var), 
df_info[var].sort_values(by="Δ", ascending=False), ) print( "\n>> {}:\n\n".format(var), df_info[var].sort_values(by="Δ", ascending=False), file=log, ) print("\n\nNew buy in:\n", file=log) for i, s in enumerate(stock_in): print( " {:0>6} {}% -> {:.2f}% {}".format( s, 0, 100 * self.hold_weights.get(s, 0), [" ", "*"][s in self.reach_max], ), end=[" | ", "\n"][(i + 1) % 3 == 0], file=log, ) print("\n\nSell:\n", file=log) for i, s in enumerate(stock_out): print( " {:0>6} {:.2f}% -> {}%".format( s, 100 * self.portfolio.weights_ystd.get(s, 0), 0), end=[" | ", "\n"][(i + 1) % 3 == 0], file=log, ) print("\n\nAdjust:\n", file=log) for i, s in enumerate(adjust): print( " {:0>6} {:.2f}% -> {:.2f}%".format( s, 100 * self.portfolio.weights_ystd.get(s, 0), 100 * self.hold_weights.get(s, 0), ), end=[" | ", "\n"][(i + 1) % 3 == 0], file=log, ) self.result = pd.Series( {s: self.hold_weights.get(s, 0) for s in tdy_holding}, name="weights") self.result.index.name = "Symbol" if save_as: self.result.sort_index().to_csv(weights_save_path + "{}.csv".format(save_as)) log.close() return self.result.sort_index()
def printpd(o):
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        # more options can be specified also
        print(o)
def test_detect_chained_assignment_warnings(self):
    with option_context("chained_assignment", "warn"):
        df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]})

        with tm.assert_produces_warning(com.SettingWithCopyWarning):
            df.loc[0]["A"] = 111
def display( frame: pd.DataFrame, order: List[str], guess: Optional[np.ndarray] = None, mode: str = "default", decimal: int = 2, ) -> None: """ Display the frame or guess through IPython. :param frame: A pd.DataFrame :param order: A permutation of ["phi", "r", "z"] :param guess: A prediction probability matrix. :param mode: One of ["default", "pairs"] If "pairs", then the answer is displayed in the same cell as the "guess" prediction. Format: "`ANSWER`[PREDICTION]" :param decimal: How many decimals places to round the guesses to. :return: None. """ table = pd.DataFrame(ext.extract_input(frame, order), columns=order) target = ext.extract_output(frame, order).round(0) if target.shape[1] > 1: column = [chr(65 + i) for i in range(target.shape[1] - 2)] noise = frame["noise"].any() padding = frame["padding"].any() column.append("noise" if noise else chr(65 + target.shape[1] - 2)) column.append("padding" if padding else chr(65 + target.shape[1] - 1)) else: column = [chr(65)] if mode == "guess": out_table = pd.DataFrame(data=guess, columns=column).replace(0, "") table = pd.concat([table, out_table], axis=1) elif mode == "discrete pairs": guess = metrics.discrete(guess).round(0) data = [] for x in range(len(guess)): row = [] for y in range(len(guess[x])): if target[x, y] == 0 and guess[x, y] == 0: row.append("") else: t, g = int(target[x, y]), int(guess[x, y]) row.append("`{0}`[{1}]".format(t, g)) data.append(row) out_table = pd.DataFrame(data=data, columns=column) table = pd.concat([table, out_table], axis=1) elif mode == "pairs" and guess is not None: guess = guess.round(decimal) data = [] for x in range(len(guess)): row = [] for y in range(len(guess[x])): if target[x, y] == 0 and guess[x, y] == 0: row.append("") else: t, g = int(target[x, y]), np.round(guess[x, y], 2) row.append("`{0}`[{1}]".format(t, g)) data.append(row) out_table = pd.DataFrame(data=data, columns=column) table = pd.concat([table, out_table], axis=1) else: out_table = pd.DataFrame(data=target, columns=column).replace(0, "") table = pd.concat([table, out_table], axis=1) with pd.option_context('display.max_columns', 0): IPython.display.display(table)
def test_repr_max_seq_item_setting(idx): # GH10182 idx = idx.repeat(50) with pd.option_context("display.max_seq_items", None): repr(idx) assert "..." not in str(idx)
def main(): name = 'Sioux Falls Network' ######################## # bootstrap parameters # ######################## boot = 5 ################################# # initialize discrete event env # ################################# env = simpy.Environment() # use instant simulation # env = simpy.rt.RealtimeEnvironment(factor=1.) # use real time simulation # setup simulation processes bsProcess = [] for b in range(boot): bs = Bootstrap(env) env.process(bs.processSimulation()) bsProcess.append(bs) start_time = timeit.default_timer() # start simulation timer env.run() end_time = timeit.default_timer() # end simulation timer # compile simulation statistics bsTable = None for n, bootstrap in enumerate(bsProcess): df = pd.DataFrame(sorted(bootstrap.sim.data, key=lambda x: x[3]), columns=['carID', 'link', 'event', 'time', 'queue', 't_queue']) meanQlength = df.loc[df['event'] == 'departure'][ ['link', 'queue']].groupby(['link']).mean() meanQlength.columns=['mean'] varQlength = df.loc[df['event'] == 'departure'][ ['link', 'queue']].groupby(['link']).var() varQlength.columns=['variance'] maxQlength = df.loc[df['event'] == 'departure'][ ['link', 'queue']].groupby(['link']).max() maxQlength.columns=['max'] if bsTable is None: bsTable = maxQlength bsTable.columns = [1] else: bsTable[n+1] = maxQlength mean = bsTable.mean(axis=1) mse = bsTable.var(axis=1, ddof=0) bsTable['mean'] = mean bsTable['MSE'] = mse print('Simulation runtime: %.3fs' % (end_time-start_time)) with pd.option_context('expand_frame_repr', False): print(bsTable)
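# Side note (not part of the simulation script above): 'expand_frame_repr' controls
# whether a wide frame's repr is wrapped across multiple blocks of columns.
# Setting it to False, as done above for bsTable, keeps all columns on single
# (possibly very long) lines.
import numpy as np
import pandas as pd

wide = pd.DataFrame(np.random.rand(3, 25))
print(wide)                                        # default settings may wrap or truncate columns
with pd.option_context('expand_frame_repr', False):
    print(wide)                                    # one block, no wrapping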
def long_short_trend(transaction_cost=0.0000, plot=False): start_bt_date_1yr_plus = '2014-09-13' start_bt_date = '2015-09-13' end_bt_date = '2020-09-13' df, stoxx600 = get_df_stoxx600( start_bt_date_1yr_plus=start_bt_date_1yr_plus, end_bt_date=end_bt_date) # df, stoxx600 = get_df_sp500(start_bt_date_1yr_plus=start_bt_date_1yr_plus, end_bt_date=end_bt_date) ## short index port_ret = stoxx600.loc[start_bt_date:end_bt_date].pct_change().rename( 'short_index').to_frame() ## sides return port_ret['long'] = trend_trading(df, start_bt_date, end_bt_date, sigs=(20, 50, 150), transaction_cost=0) port_ret['short_trend'] = trend_trading(df, start_bt_date, end_bt_date, sigs=(20, 50, 150), transaction_cost=0, direction='short') port_ret = port_ret.fillna(0) ## short trend side plt.rcParams["figure.dpi"] = 800 with pd.option_context('display.max_rows', None, 'display.max_columns', None): print(port_ret['short_trend'].sort_values(ascending=False)) (-port_ret['short_trend'].fillna(0) + 1).cumprod().plot( figsize=(5, 3.5), legend=True, lw=0.75, fontsize=10).legend(loc=2) plt.ylabel('Cumulative Return') plt.savefig("short_trend_follow/short_side_pnl.png") plt.close() ## ew_eq_curves eq_eq_curves = (port_ret + 1).cumprod() plt.rcParams["figure.dpi"] = 800 eq_eq_curves.plot(figsize=(5, 3.5), legend=True, lw=0.75, fontsize=10).legend(loc=2) plt.ylabel('Cumulative Return') plt.savefig("short_trend_follow/pnls") plt.close() ## combined Pnl plt.rcParams["figure.dpi"] = 800 combined_PnL = ( port_ret['long'] - port_ret['short_trend']).rename('long_short_trend').to_frame() combined_PnL['benchmark'] = (port_ret['long'] - port_ret['short_index']) combined_PnL = (combined_PnL.fillna(0) + 1).cumprod() combined_PnL.plot(figsize=(5, 3.5), legend=True, lw=0.75, fontsize=10).legend(loc=2) plt.ylabel('Cumulative Return') plt.savefig("short_trend_follow/combined_pnl") plt.close() # eq_eq_curves['long'] = (port_ret['long'] +1).cumprod() # eq_eq_curves['short'] = (port_ret['short'] +1).cumprod() ## performance_analysis performance_analysis(eq_eq_curves['long'], eq_eq_curves['short_trend']) ## portfolio analysis return pd.DataFrame({ 'benchmark': port_stats(eq_eq_curves['long'], eq_eq_curves['short_index']), 'long_short_trend': port_stats(eq_eq_curves['long'], eq_eq_curves['short_trend']) })
def check(
    self,
    data_frame,
    basename=None,
    fullpath=None,
    tolerances=None,
    default_tolerance=None,
):
    """
    Checks the given pandas dataframe against a previously recorded version,
    or generates a new file.

    Example::

        data_frame = pandas.DataFrame.from_dict({
            'U_gas': U[0][positions],
            'U_liquid': U[1][positions],
            'gas_vol_frac [-]': vol_frac[0][positions],
            'liquid_vol_frac [-]': vol_frac[1][positions],
            'P': Pa_to_bar(P)[positions],
        })
        dataframe_regression.check(data_frame)

    :param pandas.DataFrame data_frame: pandas DataFrame containing data for
        regression check.

    :param str basename: basename of the file to test/record. If not given the
        name of the test is used.

    :param str fullpath: complete path to use as a reference file. This option
        will ignore embed_data completely, being useful if a reference file is
        located in the session data dir for example.

    :param dict tolerances: dict mapping keys from the data_dict to tolerance
        settings for the given data. Example::

            tolerances={'U': Tolerance(atol=1e-2)}

    :param dict default_tolerance: dict with the default tolerance settings for
        the current check call. Example::

            default_tolerance=dict(atol=1e-7, rtol=1e-18)

        If not provided, will use defaults from numpy's ``isclose`` function.

    ``basename`` and ``fullpath`` are exclusive.
    """
    try:
        import pandas as pd
    except ModuleNotFoundError:
        raise ModuleNotFoundError(import_error_message("Pandas"))

    import functools

    __tracebackhide__ = True

    assert type(data_frame) is pd.DataFrame, (
        "Only pandas DataFrames are supported on the dataframe_regression fixture.\n"
        "Object with type '%s' was given." % (str(type(data_frame)), ))

    for column in data_frame.columns:
        array = data_frame[column]
        # Skip assertion if an array of strings
        if (array.dtype == "O") and (type(array[0]) is str):
            continue
        # Rejected: timedelta, datetime, objects, zero-terminated bytes,
        # unicode strings and raw data
        assert array.dtype not in [
            "m", "M", "O", "S", "a", "U", "V"
        ], ("Only numeric data is supported on the dataframe_regression fixture.\n"
            "Array with type '%s' was given.\n" % (str(array.dtype), ))

    if tolerances is None:
        tolerances = {}
    self._tolerances_dict = tolerances

    if default_tolerance is None:
        default_tolerance = {}
    self._default_tolerance = default_tolerance

    dump_fn = functools.partial(self._dump_fn, data_frame)

    with pd.option_context(*self._pandas_display_options):
        perform_regression_check(
            datadir=self.datadir,
            original_datadir=self.original_datadir,
            request=self.request,
            check_fn=self._check_fn,
            dump_fn=dump_fn,
            extension=".csv",
            basename=basename,
            fullpath=fullpath,
            force_regen=self._force_regen,
        )
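# Aside (not part of pytest-regressions itself): pd.option_context accepts a flat
# sequence of alternating option names and values, which is what the
# `*self._pandas_display_options` unpacking above relies on.  The options below are
# an illustrative assumption, not the fixture's actual defaults.
import pandas as pd

display_options = (
    "display.max_rows", None,
    "display.max_columns", None,
)
with pd.option_context(*display_options):
    print(pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]}))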
pd.read_csv('https://github.com/JamesByers/Datasets/raw/master/drinks.csv', nrows=10) # only read first 10 rows pd.read_csv('https://github.com/JamesByers/Datasets/raw/master/drinks.csv', skiprows=[1, 2]) # skip the first two rows of data # write a DataFrame out to a CSV drinks.to_csv('drinks_updated.csv') # index is used as first column drinks.to_csv('drinks_updated.csv', index=False) # ignore index # save a DataFrame to disk (aka 'pickle') and read it from disk (aka 'unpickle') drinks.to_pickle('drinks_pickle') pd.read_pickle('drinks_pickle') # randomly sample a DataFrame train = drinks.sample(frac=0.75, random_state=1) # will contain 75% of the rows test = drinks[~drinks.index.isin(train.index)] # will contain the other 25% # change the maximum number of rows and columns printed ('None' means unlimited) pd.set_option('max_rows', None) # default is 60 rows pd.set_option('max_columns', None) # default is 20 columns print drinks # reset options to defaults pd.reset_option('max_rows') pd.reset_option('max_columns') # change the options temporarily (settings are restored when you exit the 'with' block) with pd.option_context('max_rows', None, 'max_columns', None): print drinks
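# A Python 3 / newer-pandas rewrite of the display-option snippet above (illustrative,
# not from the original notes): fully qualified option names are the safer spelling,
# since abbreviated names can become ambiguous in newer pandas releases, and print()
# replaces the Python 2 print statement.
import pandas as pd

drinks = pd.read_csv('https://github.com/JamesByers/Datasets/raw/master/drinks.csv')

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
print(drinks)
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(drinks)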
import pandas as pd

from generate_counterfactuals import generate_counterfactuals
from search_utils.Query import Query
from search_utils.Sentence import Sentence

model_config.load("imdb", evalution_model="gpt2")

num = 5
result = []
for wanted_positivity in range(num + 1):
    wanted_positivity = wanted_positivity / num
    wanted_cls = [(1 - wanted_positivity), wanted_positivity]
    max_delta = 50. / num / 100.
    print(f"{wanted_cls[1]}+-{max_delta}")
    # relatively high consider_max_words because max_delta is small.
    # sent = "A decent story with some thrilling action scenes."
    # sent = "the year's best and most unpredictable comedy."
    sent = "an extremely unpleasant film."
    r = generate_counterfactuals(
        sent, Query(wanted_cls=wanted_cls, max_delta=max_delta))
    print(r.examples[0][0] if len(r.examples) > 0 else "----")
    # assumes at least one counterfactual example was found for this query
    result.append({
        "y'": f"{wanted_cls[1]:.1f} pm {max_delta:.1f}",
        "y": f"{r.examples[0][0].cls[1]:.2f}",
        "Counterfactual Example x' ": r.examples[0][0].sentence
    })
print("######")
print(f"Original cls {Sentence(sent).calc_sentiment()[1]}")
with pd.option_context("max_colwidth", 1000):
    print(pd.DataFrame(result).to_latex(index=False))
def test_dt_namespace_accessor(self): # GH 7207, 11128 # test .dt namespace accessor ok_for_base = [ 'year', 'month', 'day', 'hour', 'minute', 'second', 'weekofyear', 'week', 'dayofweek', 'weekday', 'dayofyear', 'quarter', 'freq', 'days_in_month', 'daysinmonth', 'is_leap_year' ] ok_for_period = ok_for_base + ['qyear', 'start_time', 'end_time'] ok_for_period_methods = ['strftime', 'to_timestamp', 'asfreq'] ok_for_dt = ok_for_base + [ 'date', 'time', 'microsecond', 'nanosecond', 'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end', 'is_year_start', 'is_year_end', 'tz', 'weekday_name' ] ok_for_dt_methods = [ 'to_period', 'to_pydatetime', 'tz_localize', 'tz_convert', 'normalize', 'strftime', 'round', 'floor', 'ceil', 'weekday_name' ] ok_for_td = ['days', 'seconds', 'microseconds', 'nanoseconds'] ok_for_td_methods = [ 'components', 'to_pytimedelta', 'total_seconds', 'round', 'floor', 'ceil' ] def get_expected(s, name): result = getattr(Index(s._values), prop) if isinstance(result, np.ndarray): if is_integer_dtype(result): result = result.astype('int64') elif not is_list_like(result): return result return Series(result, index=s.index, name=s.name) def compare(s, name): a = getattr(s.dt, prop) b = get_expected(s, prop) if not (is_list_like(a) and is_list_like(b)): self.assertEqual(a, b) else: tm.assert_series_equal(a, b) # datetimeindex cases = [ Series(date_range('20130101', periods=5), name='xxx'), Series(date_range('20130101', periods=5, freq='s'), name='xxx'), Series(date_range('20130101 00:00:00', periods=5, freq='ms'), name='xxx') ] for s in cases: for prop in ok_for_dt: # we test freq below if prop != 'freq': compare(s, prop) for prop in ok_for_dt_methods: getattr(s.dt, prop) result = s.dt.to_pydatetime() self.assertIsInstance(result, np.ndarray) self.assertTrue(result.dtype == object) result = s.dt.tz_localize('US/Eastern') exp_values = DatetimeIndex(s.values).tz_localize('US/Eastern') expected = Series(exp_values, index=s.index, name='xxx') tm.assert_series_equal(result, expected) tz_result = result.dt.tz self.assertEqual(str(tz_result), 'US/Eastern') freq_result = s.dt.freq self.assertEqual(freq_result, DatetimeIndex(s.values, freq='infer').freq) # let's localize, then convert result = s.dt.tz_localize('UTC').dt.tz_convert('US/Eastern') exp_values = (DatetimeIndex( s.values).tz_localize('UTC').tz_convert('US/Eastern')) expected = Series(exp_values, index=s.index, name='xxx') tm.assert_series_equal(result, expected) # round s = Series(pd.to_datetime([ '2012-01-01 13:00:00', '2012-01-01 12:01:00', '2012-01-01 08:00:00' ]), name='xxx') result = s.dt.round('D') expected = Series(pd.to_datetime( ['2012-01-02', '2012-01-02', '2012-01-01']), name='xxx') tm.assert_series_equal(result, expected) # round with tz result = ( s.dt.tz_localize('UTC').dt.tz_convert('US/Eastern').dt.round('D')) exp_values = pd.to_datetime(['2012-01-01', '2012-01-01', '2012-01-01']).tz_localize('US/Eastern') expected = Series(exp_values, name='xxx') tm.assert_series_equal(result, expected) # floor s = Series(pd.to_datetime([ '2012-01-01 13:00:00', '2012-01-01 12:01:00', '2012-01-01 08:00:00' ]), name='xxx') result = s.dt.floor('D') expected = Series(pd.to_datetime( ['2012-01-01', '2012-01-01', '2012-01-01']), name='xxx') tm.assert_series_equal(result, expected) # ceil s = Series(pd.to_datetime([ '2012-01-01 13:00:00', '2012-01-01 12:01:00', '2012-01-01 08:00:00' ]), name='xxx') result = s.dt.ceil('D') expected = Series(pd.to_datetime( ['2012-01-02', '2012-01-02', '2012-01-02']), name='xxx') 
tm.assert_series_equal(result, expected) # datetimeindex with tz s = Series(date_range('20130101', periods=5, tz='US/Eastern'), name='xxx') for prop in ok_for_dt: # we test freq below if prop != 'freq': compare(s, prop) for prop in ok_for_dt_methods: getattr(s.dt, prop) result = s.dt.to_pydatetime() self.assertIsInstance(result, np.ndarray) self.assertTrue(result.dtype == object) result = s.dt.tz_convert('CET') expected = Series(s._values.tz_convert('CET'), index=s.index, name='xxx') tm.assert_series_equal(result, expected) tz_result = result.dt.tz self.assertEqual(str(tz_result), 'CET') freq_result = s.dt.freq self.assertEqual(freq_result, DatetimeIndex(s.values, freq='infer').freq) # timedeltaindex cases = [ Series(timedelta_range('1 day', periods=5), index=list('abcde'), name='xxx'), Series(timedelta_range('1 day 01:23:45', periods=5, freq='s'), name='xxx'), Series(timedelta_range('2 days 01:23:45.012345', periods=5, freq='ms'), name='xxx') ] for s in cases: for prop in ok_for_td: # we test freq below if prop != 'freq': compare(s, prop) for prop in ok_for_td_methods: getattr(s.dt, prop) result = s.dt.components self.assertIsInstance(result, DataFrame) tm.assert_index_equal(result.index, s.index) result = s.dt.to_pytimedelta() self.assertIsInstance(result, np.ndarray) self.assertTrue(result.dtype == object) result = s.dt.total_seconds() self.assertIsInstance(result, pd.Series) self.assertTrue(result.dtype == 'float64') freq_result = s.dt.freq self.assertEqual(freq_result, TimedeltaIndex(s.values, freq='infer').freq) # both index = date_range('20130101', periods=3, freq='D') s = Series(date_range('20140204', periods=3, freq='s'), index=index, name='xxx') exp = Series(np.array([2014, 2014, 2014], dtype='int64'), index=index, name='xxx') tm.assert_series_equal(s.dt.year, exp) exp = Series(np.array([2, 2, 2], dtype='int64'), index=index, name='xxx') tm.assert_series_equal(s.dt.month, exp) exp = Series(np.array([0, 1, 2], dtype='int64'), index=index, name='xxx') tm.assert_series_equal(s.dt.second, exp) exp = pd.Series([s[0]] * 3, index=index, name='xxx') tm.assert_series_equal(s.dt.normalize(), exp) # periodindex cases = [ Series(period_range('20130101', periods=5, freq='D'), name='xxx') ] for s in cases: for prop in ok_for_period: # we test freq below if prop != 'freq': compare(s, prop) for prop in ok_for_period_methods: getattr(s.dt, prop) freq_result = s.dt.freq self.assertEqual(freq_result, PeriodIndex(s.values).freq) # test limited display api def get_dir(s): results = [r for r in s.dt.__dir__() if not r.startswith('_')] return list(sorted(set(results))) s = Series(date_range('20130101', periods=5, freq='D'), name='xxx') results = get_dir(s) tm.assert_almost_equal( results, list(sorted(set(ok_for_dt + ok_for_dt_methods)))) s = Series( period_range('20130101', periods=5, freq='D', name='xxx').asobject) results = get_dir(s) tm.assert_almost_equal( results, list(sorted(set(ok_for_period + ok_for_period_methods)))) # 11295 # ambiguous time error on the conversions s = Series(pd.date_range('2015-01-01', '2016-01-01', freq='T'), name='xxx') s = s.dt.tz_localize('UTC').dt.tz_convert('America/Chicago') results = get_dir(s) tm.assert_almost_equal( results, list(sorted(set(ok_for_dt + ok_for_dt_methods)))) exp_values = pd.date_range('2015-01-01', '2016-01-01', freq='T', tz='UTC').tz_convert('America/Chicago') expected = Series(exp_values, name='xxx') tm.assert_series_equal(s, expected) # no setting allowed s = Series(date_range('20130101', periods=5, freq='D'), name='xxx') with 
tm.assertRaisesRegexp(ValueError, "modifications"): s.dt.hour = 5 # trying to set a copy with pd.option_context('chained_assignment', 'raise'): def f(): s.dt.hour[0] = 5 self.assertRaises(com.SettingWithCopyError, f)
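# Quick reference sketch (not part of the test above): 'mode.chained_assignment'
# accepts None, 'warn' (the default) or 'raise'.  On pandas versions without
# copy-on-write, the chained-assignment check for a write such as
# df.loc[0]["A"] = 111 is disabled under None, emits SettingWithCopyWarning under
# 'warn', and raises SettingWithCopyError under 'raise' (copy-on-write pandas
# behaves differently).
import pandas as pd

df = pd.DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]})
for setting in (None, "warn", "raise"):
    with pd.option_context("mode.chained_assignment", setting):
        print(setting, pd.get_option("mode.chained_assignment"))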
def compute_portvals(start_date, end_date, orders_file, start_val): """Compute daily portfolio value given a sequence of orders in a CSV file. Parameters ---------- start_date: first date to track end_date: last date to track orders_file: CSV file to read orders from start_val: total starting cash available Returns ------- portvals: portfolio value for each trading day from start_date to end_date (inclusive) """ # TODO: Your code here symbols = list() rowCount = 0 startDate = 0 endDate = 0 lastRow = [] reader = csv.reader(open(orders_file, 'rU'), delimiter=',') for row in reader: if rowCount > 0: if rowCount == 1: startDate = row[0] symbols.append(row[1]) rowCount += 1 lastRow = row orders = pd.read_csv(orders_file, index_col='Date', parse_dates=True, na_values=['nan']) print orders endDate = lastRow[0] uniqueList = list(set(symbols)) dates = pd.date_range(start_date, end_date) prices_all = get_data(uniqueList, dates) # automatically adds SPY prices_all['CASH'] = 1.0 ordersDF = prices_all.copy(deep=True) for item in uniqueList: ordersDF[item] = 0 ordersDF['CASH'] = 0.0 ordersDF = ordersDF.drop('SPY', 1) for index, row in orders.iterrows(): shares = row[2] if row[1] == 'SELL': row[2] = -1 * row[2] if index in ordersDF.index: testValue = ordersDF.get_value(index, row[0]) setValue = row[2] if testValue: setValue = setValue + testValue ordersDF.set_value(index, row[0], setValue) #print ordersDF for index, row in ordersDF.iterrows(): rowTotal = 0 for item in uniqueList: price = prices_all.get_value(index, item) rowTotal += row[item] * price * -1 #print rowTotal ordersDF.set_value(index, 'CASH', rowTotal) #print ordersDF holdingsDF = ordersDF.copy(deep=True) holdingsDF['CASH'] = 0.0 holdingsDF.set_value(start_date, 'CASH', start_val) loopRow = holdingsDF.iterrows() holdingsDF.set_value(start_date, 'CASH', start_val) prevValue = start_val for index, row in loopRow: sharesValue = ordersDF.get_value(index, 'CASH') holdingsDF.set_value(index, 'CASH', prevValue + sharesValue) prevValue = prevValue + sharesValue #print holdingsDF loopRow = holdingsDF.iterrows() previousValues = {} for item in uniqueList: previousValues[item] = 0 for index, row in loopRow: for item in uniqueList: holdingsDF.set_value(index, item, row[item] + previousValues[item]) previousValues[item] = row[item] + previousValues[item] dfValues = holdingsDF.copy(deep=True) dfValues = dfValues.drop('CASH', 1) dfValues['VALUES'] = 0.0 loopRow = holdingsDF.iterrows() for index, row in loopRow: total = 0 for item in uniqueList: holdingsValue = holdingsDF.get_value(index, item) pricesValue = prices_all.get_value(index, item) total += holdingsValue * pricesValue dfValues.set_value(index, 'VALUES', total) #print dfValues finalDF = dfValues.copy(deep=True) for item in uniqueList: finalDF = finalDF.drop(item, 1) #print dfValues finalDF = finalDF.drop('VALUES', 1) finalDF['TOTALS'] = 0 finalDF['TOTALS'] = dfValues['VALUES'] + holdingsDF['CASH'] with pd.option_context('display.max_rows', 999, 'display.max_columns', 5): print finalDF['TOTALS'] return finalDF['TOTALS']
def fullDisplay(df, max_rows=None, max_col=None, width=None):
    """Render df in a notebook without row/column truncation.

    If width is given, column width is fixed via the Styler (CSS width in px).
    Relies on IPython's display() (built in to notebooks, otherwise
    `from IPython.display import display`).
    """
    df_cp = df.style.set_properties(
        **{'width': f'{width}px'}) if width is not None else df.copy()
    with pd.option_context('display.max_rows', max_rows,
                           'display.max_columns', max_col):
        display(df_cp)
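# Usage sketch (not from the original notebook): show a frame that would otherwise
# be truncated, once through the Styler path with a fixed 150px column width and
# once as a plain DataFrame.
import numpy as np
import pandas as pd

big = pd.DataFrame(np.random.rand(200, 40))
fullDisplay(big, width=150)   # Styler path: fixed column width
fullDisplay(big)              # plain DataFrame path, no truncation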
def list_trials(experiment_path, sort=None, output=None, filter_op=None, info_keys=None, result_keys=None): """Lists trials in the directory subtree starting at the given path. Args: experiment_path (str): Directory where trials are located. Corresponds to Experiment.local_dir/Experiment.name. sort (str): Key to sort by. output (str): Name of file where output is saved. filter_op (str): Filter operation in the format "<column> <operator> <value>". info_keys (list): Keys that are displayed. result_keys (list): Keys of last result that are displayed. """ _check_tabulate() experiment_state = _get_experiment_state(experiment_path, exit_on_fail=True) checkpoint_dicts = experiment_state["checkpoints"] checkpoint_dicts = [flatten_dict(g) for g in checkpoint_dicts] checkpoints_df = pd.DataFrame(checkpoint_dicts) if not info_keys: info_keys = DEFAULT_EXPERIMENT_INFO_KEYS if not result_keys: result_keys = DEFAULT_RESULT_KEYS result_keys = ["last_result:{}".format(k) for k in result_keys] col_keys = [ k for k in list(info_keys) + result_keys if k in checkpoints_df ] checkpoints_df = checkpoints_df[col_keys] if "last_update_time" in checkpoints_df: with pd.option_context("mode.use_inf_as_null", True): datetime_series = checkpoints_df["last_update_time"].dropna() datetime_series = datetime_series.apply( lambda t: datetime.fromtimestamp(t).strftime(TIMESTAMP_FORMAT)) checkpoints_df["last_update_time"] = datetime_series if "logdir" in checkpoints_df: # logdir often too verbose to view in table, so drop experiment_path checkpoints_df["logdir"] = checkpoints_df["logdir"].str.replace( experiment_path, "") if filter_op: col, op, val = filter_op.split(" ") col_type = checkpoints_df[col].dtype if is_numeric_dtype(col_type): val = float(val) elif is_string_dtype(col_type): val = str(val) # TODO(Andrew): add support for datetime and boolean else: raise ValueError("Unsupported dtype for \"{}\": {}".format( val, col_type)) op = OPERATORS[op] filtered_index = op(checkpoints_df[col], val) checkpoints_df = checkpoints_df[filtered_index] if sort: if sort not in checkpoints_df: raise KeyError("Sort Index \"{}\" not in: {}".format( sort, list(checkpoints_df))) checkpoints_df = checkpoints_df.sort_values(by=sort) print_format_output(checkpoints_df) if output: file_extension = os.path.splitext(output)[1].lower() if file_extension in (".p", ".pkl", ".pickle"): checkpoints_df.to_pickle(output) elif file_extension == ".csv": checkpoints_df.to_csv(output, index=False) else: raise ValueError("Unsupported filetype: {}".format(output)) print("Output saved at:", output)
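# Illustrative sketch (not part of the CLI above) of how a filter string in the
# "<column> <operator> <value>" format is split and applied; the OPERATORS mapping,
# column name and threshold here are made-up stand-ins for the real ones.
import operator
import pandas as pd

OPERATORS = {"<": operator.lt, "<=": operator.le, "==": operator.eq,
             ">=": operator.ge, ">": operator.gt}

checkpoints_df = pd.DataFrame({"trial_id": ["a", "b", "c"],
                               "last_result:episode_reward_mean": [90.0, 120.0, 150.0]})

col, op, val = "last_result:episode_reward_mean > 100".split(" ")
mask = OPERATORS[op](checkpoints_df[col], float(val))
print(checkpoints_df[mask])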
def main(): HistoPlot(year20['W_L_percent'], 10, 'Winning Percentage by Team', 'Distribution of Team Winning Percentage', False, 0, 7) HistoPlot(year20['ERA'], 10, 'Earned Run Average', 'Distribution of 2020 Team Earned Run Average', False, 0, 7) HistoPlot(year20['RBI'], 10, 'Runs Batted In', 'Distribution of 2020 Team RBIs', False, 0, 6) HistoPlot(year20['TB'], 10, 'Total Bases Achieved', 'Distribution of 2020 Total Bases Achieved by Team', False, 0, 7) HistoPlot(year20['TotalRuns'], 10, 'Total Runs Scored by a Team', 'Total Runs Scored per Team 2020', False, 0, 6) HistoPlot(year20['BA'], 10, 'Team Batting Average', 'Distribution of Team Batting Average for 2020', False, 0, 8) HistoPlot(year20['SLG'], 10, 'Total Slugging Percentage', 'Distribution of Team Slugging Percentage for 2020', False, 0, 6) HistoPlot(year20['WHIP'], 10, 'Walks & Hits Per Innining', 'Walks and Hits per Inning Pitched 2020 Distribution', False, 0, 7) HistoPlot(year20['OBP'], 10, 'On-Base Percentage', '2020 On-Base Percentage of Teams Distribution', False, 0, 8) # Representation of some Outliers in the Data print('2020 Team(s) with an ERA greater than 5.2:') print(year20[year20.ERA > 5.2][['Rk', 'Tm']]) print('\n') print('2020 Team(s) with on On-Base Percentage less than .300:') print(year20[year20.OBP < .3][['Rk', 'Tm']]) print('\n') print('2020 Team(s) with less than 225 RBIs:') print(year20[year20.RBI < 225][['Rk', 'Tm']]) print('\n') print('2020 Team(s) with more than 325 RBIs:') print(year20[year20.RBI > 325][['Rk', 'Tm']]) print('\n') print('2020 Team(s) with more than 330 Total Runs Scored:') print(year20[year20.TotalRuns > 330][['Rk', 'Tm']]) print('\n') print( '2020 Team(s) with less than 1.19 Walks and Hits per Inning Pitched:') print(year20[year20.WHIP < 1.19][['Rk', 'Tm']]) print('\n') print('2020 Team(s) with more than 1.5 ') print(year20[year20.WHIP > 1.5][['Rk', 'Tm']]) print('\n') print('2020 Team(s) with more than 950 Total Bases Achieved:') print(year20[year20.TB > 950][['Rk', 'Tm']]) print('\n') print('Season - 2020:' + '\n') print('League rank for the outliers on the "Home Runs" histogram plot,' + '\n' + 'Home Runs by a Team greater than 100: ' + str(year20[year20.HR_x > 100]['Rk'].values) + '\n') print( 'League rank for the outliers on the "Runs Batted In" histogram plot' + '\n' + 'Total Team RBIs less than 225: ' + str(year20[year20.RBI < 225]['Rk'].values) + '\n') print('League rank for the outliers on the "Total Bases" histogram plot' + '\n' + 'Total Bases Completed by a Team greater than 950: ' + str(year20[year20.TB > 950]['Rk'].values) + '\n') print( 'League rank for the outliers on the "Earned Run Average" histogram plot' + '\n' + 'Team ERA Average greater than 5.2: ' + str(year20[year20.ERA > 5.2]['Rk'].values)) print('\n') print('Season - 2019:' + '\n') print('League rank for the outliers on the "Home Runs" histogram plot,' + '\n' + 'Home Runs by a Team less than 100: ' + str(year19[year19.HR_x < 199]['Rk'].values) + '\n') print('League rank for the outliers on the "Runs Scored" histogram plot,' + '\n' + 'Runs Scored by a Team less than 650: ' + str(year19[year19.RunsPerGame < 650]['Rk'].values) + '\n') year20['ERA2'] = year20.ERA**2 year20['ERA3'] = year20.ERA**3 df_z = year20.select_dtypes(include=[np.number]).dropna().apply(st.zscore) formula = 'W_L_percent ~ ERA + RBI + TB' year20_model = smf.ols(formula, data=year20) result = smf.ols(formula, data=df_z).fit() year20_results = year20_model.fit() print(year20_results.summary()) print(result.summary()) print('\n') with 
pd.option_context('display.max_rows', None, 'display.max_columns', None): print('Wonders of Rank 14?:' + '\n' + str(year19[ year19.Rk == 14][['Tm', 'ERA', 'TB', 'WHIP', 'RBI', 'OBP']])) print('\n') with pd.option_context('display.max_rows', None, 'display.max_columns', None): print(year19[year19.Tm == 'LgAvg'][[ 'Rk', 'Tm', 'ERA', 'TB', 'WHIP', 'RBI', 'OBP' ]]) print('\n') print('2020 Variables Averages between the American and National Leagues:') print(year20[['Lg', 'TotalRuns', 'BA', 'ERA', 'OBP']].groupby('Lg').mean()) iloc_numbers1 = [6, 8, 9, 20, 21, 24, 26] iloc_numbers2 = [6, 27, 28, 29, 30, 33, 44] iloc_numbers3 = [6, 52, 67, 71, 72, 76, 84, 86] ComparingPlot( 'W_L_percent', 10, 'Distribution of Winning Percentage from the 2017-2020 Seasons', 'Winning %', 'Winning Percentage (%)', 0.05, 0.2) ComparingPlot('ERA', 10, 'Distribution of Earned Run Average, 2017-2020 Seasons', 'ERA', 'Earned Run Average', 0.1, 0.1) ComparingPlot( 'BA', 10, 'Comparing Team Batting Averages from the 2017-2020 Seasons', 'Batting Average', 'Team Batting Average', 0.1, 0.05) CorrelationMatrix(year20, iloc_numbers1) print('\n') CorrelationMatrix(year20, iloc_numbers2) print('\n') CorrelationMatrix(year20, iloc_numbers3) print('\n') ALvsNLcomp20() print('\n') print('\n') print('2020 AL Slugging Percentage Average: ' + str(round(np.mean(AL20['SLG']), 3))) print('2020 NL Slugging Percentage Average: ' + str(round(np.mean(NL20['SLG']), 3))) print('\n') print('2020 AL Earned Run Average: ' + str(round(np.mean(AL20['ERA']), 3))) print('2020 NL Earned Run Average: ' + str(round(np.mean(NL20['ERA']), 3))) print('\n') ALvsNLcomp19() print('\n') print('2019 AL Slugging Percentage Average: ' + str(round(np.mean(AL19['SLG']), 3))) print('2019 NL Slugging Percentage Average: ' + str(round(np.mean(NL19['SLG']), 3))) print('\n') print('2019 AL Earned Run Average: ' + str(round(np.mean(AL19['ERA']), 3))) print('2019 NL Earned Run Average: ' + str(round(np.mean(NL19['ERA']), 3))) print('\n') TestCDF('WHIP', 'Walks & Hits (per Inning Pitched)', 'Cumulative Distribution Function of WHIP by Team') TestCDF('BA', 'Team Batting Average', 'Cumulative Distribution Function of Team Batting Average') TestCDF('SLG', 'Slugging Percentage (% per Team)', 'Cumulative Distribution Function of Team Slugging Percentage') TestCDF('ERA', 'Opponent Earned Runs Average (per 9 Innings)', 'Cumulative Distribution Function of Earned Run Average') TestCDF('OBP', 'On-Base Percentage (%)', 'Cumulative Distribution Function of On-Base Percentage by Team') TestCDF('RunsPGame', 'Runs Per Game', 'Cumulative Distribution Function of Runs Per Game') TestCDF('SO_x', 'Strikeouts', 'Cumulative Distribution Function of Runs Per Game') OPS_cdf(log=False) print('\n') OPS_cdf(log=True) print('\n') ALvsNL('ERA') print('\n') ALvsNL('SLG') print('\n') ALvsNL('OBP') print('\n') ALvsNL('BA') print('\n') ALvsNL('WHIP') print('\n') ALvsNL('SO9') print('\n') ALvsNL('TB') print('\n') ALvsNL('SO_x') print('\n') era17_19 = year19['ERA'].append(year18['ERA']).append(year17['ERA']) year_era_test = st.ttest_ind(year20['ERA'], era17_19, equal_var=False) ba17_19 = year19['BA'].append(year18['BA']).append(year17['BA']) year_ba_test = st.ttest_ind(year20['BA'], ba17_19, equal_var=False) slg17_19 = year19['SLG'].append(year18['SLG']).append(year17['SLG']) year_slg_test = st.ttest_ind(year20['SLG'], slg17_19, equal_var=False) print('BA') print(year_ba_test) print('\n') print('SLG') print(year_slg_test) print('\n') print('ERA') print(year_era_test) print('\n') # Comparing 
Comparing2020('BA')
print('\n')
Comparing2020('SLG')
print('\n')
Comparing2020('ERA')
print('\n')
Comparing2020('WHIP')
print('\n')

# Correlation Plot and Calculations between variables
ScatterPlot('SO9', 'W_L_percent',
            'Importance of Strikeouts and Winning Percentage',
            'Strikeouts (per 9 Innings)', 'Winning Percentage (%)')
print('\n')
ScatterPlot('RunsPGame', 'W_L_percent', 'Score Runs and Winning Games',
            'Runs Scored (per Game)', 'Winning Percentage (%)')
print('\n')
ScatterPlot('OBP', 'W_L_percent',
            'Comparison between On-Base Percentage and Winning',
            'On-Base Percentage (%)', 'Winning Percentage (%)')
print('\n')
ScatterPlot('ERA', 'W_L_percent',
            'Pitchers\' Abilities to Win Games (ERA vs Winning)',
            'Earned Run Average', 'Winning Percentage (%)')
print('\n')
ScatterPlot('ERA', 'SO9', 'Earned Run Average vs Strikeouts',
            'Earned Run Average', 'Strikeouts (per 9 Innings)')
print('\n')
ScatterPlot('ERA', 'WHIP', 'Earned Run Average vs WHIP',
            'Earned Run Average', 'Walks & Hits per Inning Pitched')
print('\n')
ScatterPlot('WHIP', 'SO9', 'WHIP vs Strikeouts',
            'Walks & Hits per Inning Pitched', 'Strikeouts (per 9 Innings)')
print('\n')
ScatterPlot('TB', 'W_L_percent', 'Total Bases vs Winning Percentage',
            'Total Bases Achieved', 'Winning Percentage (%)')
print('\n')
ScatterPlot('BA', 'W_L_percent', 'Comparing Batting Average and Winning %',
            'Batting Average', 'Winning Percentage (%)')
print('\n')
ScatterPlot('WHIP', 'W_L_percent', 'WHIP vs Winning Percentage',
            'Walks & Hits per Inning Pitched', 'Winning Percentage (%)')
print('\n')
ScatterPlot('RunsPGame', 'OBP', 'On-Base Percentage\'s Influence on Runs',
            'Runs Scored (per Game)', 'On-Base Percentage (%)')