Example #1
def _show(rows, n, cols):
    """Printing to a screen or saving to a file

    rows: iterator of Row instances
    n: maximum number of lines to show
    cols:  columns to show
    """
    # Bind n to a local name for maintainability:
    # searching for nrows in an editor is easier than searching for n
    nrows = n

    if cols:
        rows = _pick(cols, rows)

    row0, rows1 = peek_first(rows)
    cols = row0.columns
    seq_values = _safe_values(rows1, cols)

    with pd.option_context("display.max_rows", nrows), \
            pd.option_context("display.max_columns", 1000):
        # make use of pandas' DataFrame display machinery
        # islice one more row than required
        # to see whether there are more rows left
        list_values = list(islice(seq_values, nrows + 1))
        print(pd.DataFrame(list_values[:nrows], columns=cols))
        if len(list_values) > nrows:
            print("...more rows...")
Example #2
    def test_info_max_cols(self):
        df = DataFrame(np.random.randn(10, 5))
        for len_, verbose in [(5, None), (5, False), (10, True)]:
            # verbose=None -> follow the max_info_columns setting, False -> summarize, True -> full output
            with option_context('max_info_columns', 4):
                buf = StringIO()
                df.info(buf=buf, verbose=verbose)
                res = buf.getvalue()
                self.assertEqual(len(res.strip().split('\n')), len_)

        for len_, verbose in [(10, None), (5, False), (10, True)]:

            # max_cols not exceeded
            with option_context('max_info_columns', 5):
                buf = StringIO()
                df.info(buf=buf, verbose=verbose)
                res = buf.getvalue()
                self.assertEqual(len(res.strip().split('\n')), len_)

        for len_, max_cols in [(10, 5), (5, 4)]:
            # setting truncates
            with option_context('max_info_columns', 4):
                buf = StringIO()
                df.info(buf=buf, max_cols=max_cols)
                res = buf.getvalue()
                self.assertEqual(len(res.strip().split('\n')), len_)

            # setting wouldn't truncate
            with option_context('max_info_columns', 5):
                buf = StringIO()
                df.info(buf=buf, max_cols=max_cols)
                res = buf.getvalue()
                self.assertEqual(len(res.strip().split('\n')), len_)
Example #3
    def test_publishes(self):
        df = pd.DataFrame({"A": [1, 2]})
        objects = [df['A'], df, df]  # series / dataframe
        expected_keys = [
            {'text/plain', 'application/vnd.dataresource+json'},
            {'text/plain', 'text/html', 'application/vnd.dataresource+json'},
        ]

        make_patch = self.mock.patch('IPython.display.display')
        opt = pd.option_context('display.html.table_schema', True)
        for obj, expected in zip(objects, expected_keys):
            with opt, make_patch as mock_display:
                handle = obj._ipython_display_()
                self.assertEqual(mock_display.call_count, 1)
                self.assertIsNone(handle)
                args, kwargs = mock_display.call_args
                arg, = args  # just one argument

            self.assertEqual(kwargs, {"raw": True})
            self.assertEqual(set(arg.keys()), expected)

        with_latex = pd.option_context('display.latex.repr', True)

        with opt, with_latex, make_patch as mock_display:
            handle = obj._ipython_display_()
            args, kwargs = mock_display.call_args
            arg, = args

        expected = {'text/plain', 'text/html', 'text/latex',
                    'application/vnd.dataresource+json'}
        self.assertEqual(set(arg.keys()), expected)
Example #4
    def test_repr_dimensions(self):
        df = DataFrame([[1, 2, ], [3, 4]])
        with option_context('display.show_dimensions', True):
            self.assertTrue("2 rows x 2 columns" in repr(df))

        with option_context('display.show_dimensions', False):
            self.assertFalse("2 rows x 2 columns" in repr(df))

        with option_context('display.show_dimensions', 'truncate'):
            self.assertFalse("2 rows x 2 columns" in repr(df))
Example #5
def ent_ri_upsell_opp(df, include_rds=False):
	# Variables to store tables and images
	tables=[]
	images=[]
	# Get today's date
	today = datetime.today().date()
	last_month = date(today.year, today.month-1, 1)
	# Group by AR Period
	df = df.groupby('AR Period')
	df = df.get_group(last_month)
	# Group By Territory
	by_ter = df.groupby(['Territory'])
	# Empty DataFrame to Store Overview
	ec2_overview = pd.DataFrame(columns=('Territory', 'Total Revenue', 'Total EC2', 'OnDemand', 'Av. % Optimised', 'Partial RI Opp.', 'Full RI Opp.'))
	if include_rds:
		rds_overview = pd.DataFrame(columns=('Territory', 'Total Revenue', 'Total RDS', 'OnDemand', 'Av. % Optimised', 'Partial RI Opp.', 'Full RI Opp.'))
	# For each territory
	for territory, data in by_ter:
		tbl, img, over = ent_ri_ec2_upsell(data, territory)
		tables += tbl
		images += img
		ec2_overview = ec2_overview.append(over)
		if include_rds:
			tbl, img, over = ent_ri_rds_upsell(data, territory)
			tables += tbl
			images += img
			rds_overview = rds_overview.append(over)
	# Include EC2 Overview Table
	ec2_overview = ec2_overview[['Territory', 'Total Revenue', 'Total EC2', 'OnDemand', 'Av. % Optimised', 'Partial RI Opp.', 'Full RI Opp.']].sort('Partial RI Opp.', ascending=False)
	ec2_overview.rename(columns={'Av. % Optimised': 'Av. \% Optimised'}, inplace=True)
	pretty_overview = ec2_overview.apply(add_commas_df)
	pretty_overview['Territory'] = pretty_overview['Territory'].apply(add_href)
	with pd.option_context("max_colwidth", 1000):
		pretty_overview.to_latex('ent-ec2-ri-upsell-overview.tex', index=False, na_rep="Unknown", escape=False)
	table= {'name':"\\textbf{Ent EC2 RI Upsell Opportunity Overview (%s)}"%last_month.strftime("%b-%y"),\
		'file':'ent-ec2-ri-upsell-overview','section':'Overview'}
	tables.append(table)
	# Include RDS Overview Table
	if include_rds:
		rds_overview = rds_overview[['Territory', 'Total Revenue', 'Total RDS', 'OnDemand', 'Av. % Optimised', 'Partial RI Opp.', 'Full RI Opp.']].sort('Partial RI Opp.', ascending=False)
		rds_overview.rename(columns={'Av. % Optimised': 'Av. \% Optimised'}, inplace=True)
		pretty_overview = rds_overview.apply(add_commas_df)
		pretty_overview['Territory'] = pretty_overview['Territory'].apply(add_href)
		with pd.option_context("max_colwidth", 1000):
			pretty_overview.to_latex('ent-rds-ri-upsell-overview.tex', index=False, na_rep="Unknown", escape=False)
		table= {'name':"\\textbf{Ent RDS RI Upsell Opportunity Overview (%s)}"%last_month.strftime("%b-%y"),\
			'file':'ent-rds-ri-upsell-overview','section':'Overview'}
		tables.append(table)

	return tables, images
Example #6
def correlate_operators(operator_so, operator_si, verbose):

    # inner merge to get linear combinations of contributing correlation functions
    lattice_operators = pd.merge(operator_so, operator_si,
                                 how='inner', left_index=True, right_index=True,
                                 suffixes=['_{so}', '_{si}'])

    lattice_operators['coefficient'] = lattice_operators['coefficient_{so}'].apply(np.conj) \
        * lattice_operators['coefficient_{si}']
    lattice_operators.drop(
        ['coefficient_{so}', 'coefficient_{si}'], axis=1, inplace=True)

    lattice_operators.reset_index(inplace=True)
    index = lattice_operators.columns.difference(['coefficient']).tolist()
    order = {r'Irrep': 0,
             r'mult': 1,
             r'p_{cm}': 2,
             r'operator_label_{so}': 3,
             r'operator_label_{si}': 4,
             r'\mu': 5,
             r'\beta': 6,
             r'q_{so}': 7,
             r'q_{si}': 8,
             r'p^{0}_{so}': 9,
             r'p^{1}_{so}': 10,
             r'p^{0}_{si}': 11,
             r'p^{1}_{si}': 12,
             r'\gamma^{0}_{so}': 13,
             r'\gamma^{1}_{so}': 14,
             r'\gamma^{0}_{si}': 15,
             r'\gamma^{1}_{si}': 16}
    index = sorted(index, key=lambda x: order[x])
    lattice_operators.set_index(index, inplace=True)

    lattice_operators = lattice_operators.sum(axis=0, level=index)

    lattice_operators = lattice_operators[lattice_operators['coefficient'] != 0]

    if verbose >= 1:
        print 'lattice_operators'
    if verbose == 1:
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            print lattice_operators.head()
    if verbose >= 2:
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            print lattice_operators

    return lattice_operators
Example #7
    def test_unicode_print(self):
        c = Categorical(['aaaaa', 'bb', 'cccc'] * 20)
        expected = """\
[aaaaa, bb, cccc, aaaaa, bb, ..., bb, cccc, aaaaa, bb, cccc]
Length: 60
Categories (3, object): [aaaaa, bb, cccc]"""

        assert repr(c) == expected

        c = Categorical(['ああああ', 'いいいいい', 'ううううううう'] * 20)
        expected = """\
[ああああ, いいいいい, ううううううう, ああああ, いいいいい, ..., いいいいい, ううううううう, ああああ, いいいいい, ううううううう]
Length: 60
Categories (3, object): [ああああ, いいいいい, ううううううう]"""  # noqa

        assert repr(c) == expected

        # the unicode option should not affect Categorical, as it doesn't care
        # about the repr width
        with option_context('display.unicode.east_asian_width', True):

            c = Categorical(['ああああ', 'いいいいい', 'ううううううう'] * 20)
            expected = """[ああああ, いいいいい, ううううううう, ああああ, いいいいい, ..., いいいいい, ううううううう, ああああ, いいいいい, ううううううう]
Length: 60
Categories (3, object): [ああああ, いいいいい, ううううううう]"""  # noqa

            assert repr(c) == expected
Example #8
def main(args):
    df = pd.DataFrame(index=args.directories, columns=["sentences", "tokens", "nodes", "discontinuous", "reentrant",
                                                       "implicit", "edges", "primary", "remote"])
    df.fillna(0, inplace=True)
    for i, directory in enumerate(args.directories):
        row = df.loc[directory]
        for passage in get_passages_with_progress_bar(directory, desc=directory):
            l1 = passage.layer(layer1.LAYER_ID)
            non_terminals = [n for n in l1.all if n not in l1.heads and len(n.get_terminals()) > 1]
            edges = {e for n in non_terminals for e in n}
            remote_counter = Counter(e.attrib.get("remote", False) for e in edges)
            row["sentences"] += 1
            row["tokens"] += len(passage.layer(layer0.LAYER_ID).all)
            row["nodes"] += len(non_terminals)
            row["discontinuous"] += sum(1 for n in non_terminals if n.discontiguous)
            row["reentrant"] += sum(1 for n in non_terminals if any(e.attrib.get("remote") for e in n.incoming))
            row["edges"] += len(edges)
            row["primary"] += remote_counter[False]
            row["remote"] += remote_counter[True]
            row["implicit"] += sum(1 for n in l1.all if n.attrib.get("implicit"))

    # Change to percentages
    df["discontinuous"] *= 100. / df["nodes"]
    df["reentrant"] *= 100. / df["nodes"]
    df["implicit"] *= 100. / df["nodes"]
    df["primary"] *= 100. / df["edges"]
    df["remote"] *= 100. / df["edges"]

    # Print
    if args.outfile:
        df.T.to_csv(args.outfile, float_format="%.2f", sep="&", line_terminator=" \\\\\n")
        print("Saved to " + args.outfile)
    else:
        with pd.option_context("display.max_rows", None, "display.max_columns", None):
            print(df.T)
Example #9
def earncost(): 
    if request.method=='GET':
        earntarget = 60000
        tuitiontarget = 10000
        nn = 5
    else:
        earntarget = int(request.form['earnings'])
        tuitiontarget = int(request.form['tuition'])
        nn = int(request.form['viewsize'])
    # run function to get tables
    cols = df.columns
    dfs = {}
    for percent, col in zip(['50%', '25%', '10%'], cols[3:]):
        outcols = list(cols[:3])
        outcols.append(col) # inst, state, tuition, earnings
        dfres = nncalc(outcols, earntarget, tuitiontarget, nn)
        dfres.columns = ['Institution', 'State', 'Annual Tuition', 'Reported Earnings']
        dfres.sort_values('Annual Tuition', inplace=True, ascending=True)
        with pd.option_context('max_colwidth', -1):
            testhtml = dfres.to_html(index=False, escape=False, 
            classes='table table-condensed table-striped table-bordered')
        testhtml = testhtml.replace('border="1" ', '').replace('class="dataframe ', 'class="')
        testhtml = testhtml.replace(' style="text-align: right;"', '').replace('&amp;', '&')
        dfs[percent] = testhtml

    # modification date
    updated = moddate()

    return render_template('earncost.html', updated=updated, dfs=dfs,
                            earnings=earntarget, tuition=tuitiontarget, viewsize=nn)
Example #10
 def test_repr_max_seq_item_setting(self):
     # GH10182
     idx = self.create_index()
     idx = idx.repeat(50)
     with pd.option_context("display.max_seq_items", None):
         repr(idx)
         self.assertFalse('...' in str(idx))
Example #11
def test_ignore_display_max_colwidth(method, expected, max_colwidth):
    # see gh-17004
    df = DataFrame([lorem_ipsum])
    with pd.option_context('display.max_colwidth', max_colwidth):
        result = getattr(df, method)()
    expected = expected(max_colwidth)
    assert expected in result
Example #12
def _split_symbol_mappings(df, exchanges):
    """Split out the symbol: sid mappings from the raw data.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe with multiple rows for each symbol: sid pair.
    exchanges : pd.DataFrame
        The exchanges table.

    Returns
    -------
    asset_info : pd.DataFrame
        The asset info with one row per asset.
    symbol_mappings : pd.DataFrame
        The dataframe of just symbol: sid mappings. The index will be
        the sid, then there will be three columns: symbol, start_date, and
        end_date.
    """
    mappings = df[list(mapping_columns)]
    with pd.option_context('mode.chained_assignment', None):
        mappings['sid'] = mappings.index
    mappings.reset_index(drop=True, inplace=True)

    # take the most recent sid->exchange mapping based on end date
    asset_exchange = df[
        ['exchange', 'end_date']
    ].sort_values('end_date').groupby(level=0)['exchange'].nth(-1)

    _check_symbol_mappings(mappings, exchanges, asset_exchange)
    return (
        df.groupby(level=0).apply(_check_asset_group),
        mappings,
    )
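
The only reason for option_context in _split_symbol_mappings is to silence pandas' SettingWithCopyWarning while a new column is written onto a slice of df. A minimal sketch of that behaviour on toy data (illustrative frame, not the module's real inputs; on copy-on-write pandas the warning and this option no longer apply):

import pandas as pd

raw = pd.DataFrame({"symbol": ["AAPL", "MSFT"], "start_date": [0, 1]})
mappings = raw[["symbol", "start_date"]]   # column subset, i.e. a copy of a slice
with pd.option_context("mode.chained_assignment", None):
    mappings["sid"] = mappings.index       # would normally emit SettingWithCopyWarning
print(mappings)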
Example #13
    def test_register_writer(self):
        # some awkward mocking to test out dispatch and such actually works
        called_save = []
        called_write_cells = []

        class DummyClass(ExcelWriter):
            called_save = False
            called_write_cells = False
            supported_extensions = ['xlsx', 'xls']
            engine = 'dummy'

            def save(self):
                called_save.append(True)

            def write_cells(self, *args, **kwargs):
                called_write_cells.append(True)

        def check_called(func):
            func()
            assert len(called_save) >= 1
            assert len(called_write_cells) >= 1
            del called_save[:]
            del called_write_cells[:]

        with pd.option_context('io.excel.xlsx.writer', 'dummy'):
            register_writer(DummyClass)
            writer = ExcelWriter('something.xlsx')
            assert isinstance(writer, DummyClass)
            df = tm.makeCustomDataframe(1, 1)
            check_called(lambda: df.to_excel('something.xlsx'))
            check_called(
                lambda: df.to_excel(
                    'something.xls', engine='dummy'))
Example #14
def get_data(stream, parameters, fmt):
    """Retrieve data for given stream and parameters, or None if not found"""
    sds = kp.db.StreamDS()
    if stream not in sds.streams:
        log.error("Stream '{}' not found in the database.".format(stream))
        return
    params = {}
    if parameters:
        for parameter in parameters:
            if '=' not in parameter:
                log.error(
                    "Invalid parameter syntax '{}'\n"
                    "The correct syntax is 'parameter=value'".
                    format(parameter)
                )
                continue
            key, value = parameter.split('=')
            params[key] = value
    data = sds.get(stream, fmt, **params)
    if data is not None:
        with pd.option_context('display.max_rows', None, 'display.max_columns',
                               None):
            print(data)
    else:
        sds.help(stream)
Example #15
    def test_sum_overflow(self, use_bottleneck):

        with pd.option_context('use_bottleneck', use_bottleneck):
            # GH#6915
            # overflowing on the smaller int dtypes
            for dtype in ['int32', 'int64']:
                v = np.arange(5000000, dtype=dtype)
                s = Series(v)

                result = s.sum(skipna=False)
                assert int(result) == v.sum(dtype='int64')
                result = s.min(skipna=False)
                assert int(result) == 0
                result = s.max(skipna=False)
                assert int(result) == v[-1]

            for dtype in ['float32', 'float64']:
                v = np.arange(5000000, dtype=dtype)
                s = Series(v)

                result = s.sum(skipna=False)
                assert result == v.sum(dtype=dtype)
                result = s.min(skipna=False)
                assert np.allclose(float(result), 0.0)
                result = s.max(skipna=False)
                assert np.allclose(float(result), v[-1])
Example #16
def main():
	game_details = Readme()  #read in game details for all games by conference
	ListofTeams = TeamList(game_details)  #get list of teams for which there is PBP data
	
	print "Got Game Details"
	with pd.option_context('display.max_rows', 500, 'display.max_columns', 2):
		print ListofTeams

	ListofTeams.to_csv("ListofTeams.csv",sep = ',')

	## Takes the number on the list of teams that the user chooses
	Num_team_choice = int(raw_input("Please select the number corresponding to the team you want: "))
	#Num_team_choice = 125
	#name of team chosen 
	Team_Choice = ListofTeams.iloc[Num_team_choice][0] 

	##get schedule for team of choice.  returns a dictionary with columns "fullsched" and "game_days"
	TeamSched = schedule(Team_Choice, game_details)
	print "Got Schedule for %s" % Team_Choice

	merged_data = pbp_stats(TeamSched["fullsched"], TeamSched["game_days"], game_details, Team_Choice)
	print "Merged Data"
	
	#print merged_data.head(10)
	filename = "../Processed-PBP/%s.csv" % Team_Choice
	#filename = "../Processed-PBP/test.csv"

	merged_data.to_csv(filename,sep = '\t')
	print "Done!"
Example #17
    def test_representation_to_series(self):
        idx1 = TimedeltaIndex([], freq='D')
        idx2 = TimedeltaIndex(['1 days'], freq='D')
        idx3 = TimedeltaIndex(['1 days', '2 days'], freq='D')
        idx4 = TimedeltaIndex(['1 days', '2 days', '3 days'], freq='D')
        idx5 = TimedeltaIndex(['1 days 00:00:01', '2 days', '3 days'])

        exp1 = """Series([], dtype: timedelta64[ns])"""

        exp2 = ("0   1 days\n"
                "dtype: timedelta64[ns]")

        exp3 = ("0   1 days\n"
                "1   2 days\n"
                "dtype: timedelta64[ns]")

        exp4 = ("0   1 days\n"
                "1   2 days\n"
                "2   3 days\n"
                "dtype: timedelta64[ns]")

        exp5 = ("0   1 days 00:00:01\n"
                "1   2 days 00:00:00\n"
                "2   3 days 00:00:00\n"
                "dtype: timedelta64[ns]")

        with pd.option_context('display.width', 300):
            for idx, expected in zip([idx1, idx2, idx3, idx4, idx5],
                                     [exp1, exp2, exp3, exp4, exp5]):
                result = repr(pd.Series(idx))
                assert result == expected
Example #18
    def test_representation(self, method):
        idx1 = TimedeltaIndex([], freq='D')
        idx2 = TimedeltaIndex(['1 days'], freq='D')
        idx3 = TimedeltaIndex(['1 days', '2 days'], freq='D')
        idx4 = TimedeltaIndex(['1 days', '2 days', '3 days'], freq='D')
        idx5 = TimedeltaIndex(['1 days 00:00:01', '2 days', '3 days'])

        exp1 = """TimedeltaIndex([], dtype='timedelta64[ns]', freq='D')"""

        exp2 = ("TimedeltaIndex(['1 days'], dtype='timedelta64[ns]', "
                "freq='D')")

        exp3 = ("TimedeltaIndex(['1 days', '2 days'], "
                "dtype='timedelta64[ns]', freq='D')")

        exp4 = ("TimedeltaIndex(['1 days', '2 days', '3 days'], "
                "dtype='timedelta64[ns]', freq='D')")

        exp5 = ("TimedeltaIndex(['1 days 00:00:01', '2 days 00:00:00', "
                "'3 days 00:00:00'], dtype='timedelta64[ns]', freq=None)")

        with pd.option_context('display.width', 300):
            for idx, expected in zip([idx1, idx2, idx3, idx4, idx5],
                                     [exp1, exp2, exp3, exp4, exp5]):
                result = getattr(idx, method)()
                assert result == expected
Example #19
    def test_representation(self):
        idx1 = TimedeltaIndex([], freq='D')
        idx2 = TimedeltaIndex(['1 days'], freq='D')
        idx3 = TimedeltaIndex(['1 days', '2 days'], freq='D')
        idx4 = TimedeltaIndex(['1 days', '2 days', '3 days'], freq='D')
        idx5 = TimedeltaIndex(['1 days 00:00:01', '2 days', '3 days'])

        exp1 = """TimedeltaIndex([], dtype='timedelta64[ns]', freq='D')"""

        exp2 = ("TimedeltaIndex(['1 days'], dtype='timedelta64[ns]', "
                "freq='D')")

        exp3 = ("TimedeltaIndex(['1 days', '2 days'], "
                "dtype='timedelta64[ns]', freq='D')")

        exp4 = ("TimedeltaIndex(['1 days', '2 days', '3 days'], "
                "dtype='timedelta64[ns]', freq='D')")

        exp5 = ("TimedeltaIndex(['1 days 00:00:01', '2 days 00:00:00', "
                "'3 days 00:00:00'], dtype='timedelta64[ns]', freq=None)")

        with pd.option_context('display.width', 300):
            for idx, expected in zip([idx1, idx2, idx3, idx4, idx5],
                                     [exp1, exp2, exp3, exp4, exp5]):
                for func in ['__repr__', '__unicode__', '__str__']:
                    result = getattr(idx, func)()
                    self.assertEqual(result, expected)
Example #20
    def test_representation_to_series(self):
        idx1 = TimedeltaIndex([], freq='D')
        idx2 = TimedeltaIndex(['1 days'], freq='D')
        idx3 = TimedeltaIndex(['1 days', '2 days'], freq='D')
        idx4 = TimedeltaIndex(['1 days', '2 days', '3 days'], freq='D')
        idx5 = TimedeltaIndex(['1 days 00:00:01', '2 days', '3 days'])

        exp1 = """Series([], dtype: timedelta64[ns])"""

        exp2 = """0   1 days
dtype: timedelta64[ns]"""

        exp3 = """0   1 days
1   2 days
dtype: timedelta64[ns]"""

        exp4 = """0   1 days
1   2 days
2   3 days
dtype: timedelta64[ns]"""

        exp5 = """0   1 days 00:00:01
1   2 days 00:00:00
2   3 days 00:00:00
dtype: timedelta64[ns]"""

        with pd.option_context('display.width', 300):
            for idx, expected in zip([idx1, idx2, idx3, idx4, idx5],
                                     [exp1, exp2, exp3, exp4, exp5]):
                result = repr(pd.Series(idx))
                self.assertEqual(result, expected)
Example #21
    def test_repr_array_long(self, data):
        # some arrays may be able to assert a ... in the repr
        with pd.option_context('display.max_seq_items', 1):
            result = repr(data)

            assert '...' in result
            assert 'length' in result
Example #22
File: views.py  Project: emwalker/lenrmc
    def call(self):
        """Print to the io object."""
        self.io.write('')
        self.io.write('At second:      {}'.format(self.kwargs.get('seconds')))
        self.io.write('Starting moles: {}'.format(self.kwargs.get('moles')))
        self.io.write('Activity:       {:.2e}'.format(self.scenario.activity()))
        self.io.write('Watts:          {:.2e}'.format(self.scenario.power().watts))
        self.io.write('')

        df = self.scenario.df[[
            'parent',
            'daughters',
            'parent_fraction',
            'q_value_mev',
            'starting_moles',
            'gamow_factor',
            'partial_half_life',
            'partial_activity',
            'watts',
        ]]

        if df.empty:
            self.io.write('No active isotopes.')
        else:
            with pd.option_context('display.max_rows', 999, 'display.max_columns', 10):
                df = df.dropna().sort_values(['watts', 'gamow_factor'], ascending=[0, 1])
                self.io.write(df.to_string() + '\n')
        self.io.write('')
Example #23
    def draw(self, return_ggplot=False):
        """
        Render the complete plot

        Parameters
        ----------
        return_ggplot : bool
            If ``True``, return ggplot object.

        Returns
        -------
        fig : ~matplotlib.figure.Figure
            Matplotlib figure
        plot : ggplot (optional)
            The ggplot object used for drawn, if ``return_ggplot`` is
            ``True``.

        Notes
        -----
        This method does not modify the original ggplot object. You can
        get the modified ggplot object with :py:`return_ggplot=True`.
        """
        # Pandas deprecated is_copy, and when we create new dataframes
        # from slices we do not want complaints. We always uses the
        # new frames knowing that they are separate from the original.
        with pd.option_context('mode.chained_assignment', None):
            return self._draw(return_ggplot)
Example #24
    def _check_stat_op(self, name, alternate, string_series_,
                       check_objects=False, check_allna=False):

        with pd.option_context('use_bottleneck', False):
            f = getattr(Series, name)

            # add some NaNs
            string_series_[5:15] = np.NaN

            # mean, idxmax, idxmin, min, and max are valid for dates
            if name not in ['max', 'min', 'mean']:
                ds = Series(pd.date_range('1/1/2001', periods=10))
                with pytest.raises(TypeError):
                    f(ds)

            # skipna or no
            assert pd.notna(f(string_series_))
            assert pd.isna(f(string_series_, skipna=False))

            # check the result is correct
            nona = string_series_.dropna()
            tm.assert_almost_equal(f(nona), alternate(nona.values))
            tm.assert_almost_equal(f(string_series_), alternate(nona.values))

            allna = string_series_ * np.nan

            if check_allna:
                assert np.isnan(f(allna))

            # dtype=object with None, it works!
            s = Series([1, 2, 3, None, 5])
            f(s)

            # GH#2888
            items = [0]
            items.extend(lrange(2 ** 40, 2 ** 40 + 1000))
            s = Series(items, dtype='int64')
            tm.assert_almost_equal(float(f(s)), float(alternate(s.values)))

            # check date range
            if check_objects:
                s = Series(pd.bdate_range('1/1/2000', periods=10))
                res = f(s)
                exp = alternate(s)
                assert res == exp

            # check on string data
            if name not in ['sum', 'min', 'max']:
                with pytest.raises(TypeError):
                    f(Series(list('abc')))

            # Invalid axis.
            with pytest.raises(ValueError):
                f(string_series_, axis=1)

            # Unimplemented numeric_only parameter.
            if 'numeric_only' in compat.signature(f).args:
                with pytest.raises(NotImplementedError, match=name):
                    f(string_series_, numeric_only=True)
Example #25
def pca_signal(signal):
    # Data may contain "Inf" or "NaN" values for some rages, let's just skip
    # such values otherwise PCA will fail
    with pd.option_context('mode.use_inf_as_null', True):
        signal = signal.dropna(how="any", axis=0).T
    pca = PCA(n_components=2)
    x_r = pca.fit_transform(signal)
    return pca, x_r
Example #26
def get_describe(df):
    desc = df.describe()
    print(desc)
    desc = pd.DataFrame([df.median(), df.mean(), df.std(ddof=0)], index=['median', 'mean', 'std'])
    # print(desc.ix[['mean', 'std']])
    with pd.option_context('display.precision', 4):
        print(desc)
    return desc
Example #27
    def test_print_none_width(self):
        # GH10087
        a = Series(Categorical([1, 2, 3, 4]))
        exp = ("0    1\n1    2\n2    3\n3    4\n"
               "dtype: category\nCategories (4, int64): [1, 2, 3, 4]")

        with option_context("display.width", None):
            assert exp == repr(a)
Example #28
 def test_setitem_chained_no_consolidate(self):
     # https://github.com/pandas-dev/pandas/pull/19268
     # issuecomment-361696418
     # chained setitem used to cause consolidation
     sdf = pd.SparseDataFrame([[np.nan, 1], [2, np.nan]])
     with pd.option_context('mode.chained_assignment', None):
         sdf[0][1] = 2
     assert len(sdf._data.blocks) == 2
Example #29
def freqTable(grams):
    dataOut = open("dataOut.txt", "w")
    # lines = [line.strip() for line in grams if line.strip() and not line.startswith('com')]
    lineSer = pd.Series(grams)
    freq = lineSer.value_counts()
    freq.to_csv("dataOut.txt")
    with pd.option_context("display.max_rows", 999):
        print freq
Example #30
    def test_detect_chained_assignment_warnings(self):

        # warnings
        with option_context('chained_assignment', 'warn'):
            df = DataFrame({'A': ['aaa', 'bbb', 'ccc'], 'B': [1, 2, 3]})
            with tm.assert_produces_warning(
                    expected_warning=com.SettingWithCopyWarning):
                df.loc[0]['A'] = 111
Example #31
print('Preparing pivot tables.')
df_pvt1 = pd.pivot_table(df_inv,
                         index=['_grp', 'YEAR', 'PERIOD'],
                         values=['_adjvol'],
                         columns=[],
                         aggfunc='sum',
                         margins=False,
                         margins_name='Total',
                         fill_value='')
query_text = '_grp != [\'' + conf.NF + '\',\'' + conf.UD + '\']'
df_out_actual = df_pvt1.query(
    query_text)  # output pivot table - Historical demand
df_out_flat = pd.DataFrame(df_out_actual.to_records()
                           )  # *** flattened pivot table for forecasting ***

with pd.option_context('display.max_rows', 100000, 'display.max_columns', 6,
                       'display.expand_frame_repr', False):
    print(df_out_flat[['_grp', 'YEAR', 'PERIOD', '_adjvol']])

# df_pvt2 will be used for reporting out chemical prefixes without chemical groups (df_not_grouped).
df_pvt2 = pd.pivot_table(df_inv,
                         index=['_grp', '_pref'],
                         values=['_adjvol'],
                         columns=[],
                         aggfunc='sum',
                         margins=False,
                         margins_name='Total',
                         fill_value='')
query_text = '_grp == [\'' + conf.NF + '\',\'' + conf.UD + '\']'
df_not_grouped = df_pvt2.query(
    query_text)  # chemical prefixes remained to be manually grouped in Excel
Example #32
 def print_pandas_array(self, array):
     import pandas as pd
     if len(array) > 0:
         with pd.option_context('display.max_rows', None, 'display.max_columns', None):
             print(array)
Example #33
 def _get_formatted_values(self):
     with option_context('display.max_colwidth', 999999):
         fmt_values = {i: self.fmt._format_col(i)
                       for i in range(self.ncols)}
     return fmt_values
Example #34
def build(source_index, dest_index, W=10):
    _dataset = load_dataset(source_index, return_index=True)

    for _sym, entry in _dataset.items():
        _df = pd.read_csv(entry['csv'],
                          sep=',',
                          encoding='utf-8',
                          index_col='Date',
                          parse_dates=True)
        _target = pd.read_csv(entry['target_csv'],
                              sep=',',
                              encoding='utf-8',
                              index_col='Date',
                              parse_dates=True)
        ohlcv = _df[entry['features']['ohlcv']]

        ohlcv_d = {
            d: _df[entry['features']['ohlcv_{}d'.format(d)]]
            for d in [3, 7, 30]
        }
        ta_d = {
            d: _df[entry['features']['ta_{}d'.format(d)]]
            for d in [3, 7, 30]
        }

        ta = _df[entry['features']['ta']]
        cm = _df[entry['features']['cm']]

        cm_picked = pd.DataFrame(index=ohlcv.index)
        if 'adractcnt' in cm.columns:
            cm_picked['adractcnt_pct'] = cm.adractcnt.pct_change()
            # cm_picked['adractcnt_mean3_pct'] = cm.adractcnt.rolling(3).mean().pct_change()
            # cm_picked['adractcnt_mean7_pct'] = cm.adractcnt.rolling(7).mean().pct_change()
        # if 'splycur' in cm.columns: ## Correlated with volume and close
        #     cm_picked['vol_supply'] = ohlcv.volume / cm.splycur # Ratio between transacted volume and total supply (mined)
        if 'txtfrvaladjntv' in cm.columns and 'isstotntv' in cm.columns and 'feetotntv' in cm.columns:
            # I want to represent miners earnings (fees + issued coins) vs amount transacted in that interval
            cm_picked['earned_vs_transacted'] = (
                cm.isstotntv + cm.feetotntv) / cm.txtfrvaladjntv
        if 'isstotntv' in cm.columns:
            # isstotntv is total number of coins mined in the time interval
            # splycur is total number of coins mined (all time)
            total_mined = cm.isstotntv.rolling(
                365, min_periods=7).sum()  # total mined in a year
            cm_picked['isstot365_isstot1_pct'] = (cm.isstotntv /
                                                  total_mined).pct_change()
        if 'splycur' in cm.columns and 'isstotntv' in cm.columns:
            cm_picked['splycur_isstot1_pct'] = (cm.isstotntv /
                                                cm.splycur).pct_change()
        if 'hashrate' in cm.columns:
            #cm_picked['hashrate_mean3_pct'] = cm.hashrate.rolling(3).mean().pct_change()
            #cm_picked['hashrate_mean7_pct'] = cm.hashrate.rolling(7).mean().pct_change()
            cm_picked['hashrate_pct'] = cm.hashrate.pct_change()
        if 'roi30d' in cm.columns:
            cm_picked['roi30d'] = cm.roi30d
        if 'isstotntv' in cm.columns:
            cm_picked['isstotntv_pct'] = cm.isstotntv.pct_change()
        if 'feetotntv' in cm.columns:
            cm_picked['feetotntv_pct'] = cm.feetotntv.pct_change()
        if 'txtfrcount' in cm.columns:
            cm_picked['txtfrcount_pct'] = cm.txtfrcount.pct_change()
            #cm_picked['txtfrcount_volume'] = cm.txtfrcount.pct_change()
        if 'vtydayret30d' in cm.columns:
            cm_picked['vtydayret30d'] = cm.vtydayret30d
        if 'isscontpctann' in cm.columns:
            cm_picked['isscontpctann'] = cm.isscontpctann

        ta_picked = pd.DataFrame(index=ta.index)
        # REMA / RSMA are already used and well-established in ATSA,
        # I'm taking the pct change since I want to encode the relative movement of the EMAs, not their positions
        # ta_picked['rema_5_20_pct'] = ta.rema_5_20.pct_change()
        ta_picked['rema_8_15_pct'] = ta.rema_8_15.pct_change()
        # ta_picked['rema_20_50_pct'] = ta.rema_20_50.pct_change()
        # ta_picked['rsma_5_20_pct'] = ta.rema_5_20.pct_change()
        ta_picked['rsma_8_15_pct'] = ta.rema_8_15.pct_change()
        # ta_picked['rsma_20_50_pct'] = ta.rema_20_50.pct_change()

        # Stoch is a momentum indicator comparing a particular closing price of a security to a range of its prices
        # over a certain period of time.
        # The sensitivity of the oscillator to market movements is reducible by adjusting that time period or
        # by taking a moving average of the result.
        # It is used to generate overbought and oversold trading signals, utilizing a 0-100 bounded range of values.
        # IDEA => decrease sensitivity by 3-mean and divide by 100 to get fp values
        ta_picked['stoch_14_mean3_div100'] = ta.stoch_14.rolling(
            3).mean() / 100

        #Moving Average Convergence Divergence (MACD) is a trend-following momentum indicator that shows
        # the relationship between two moving averages of a security’s price.
        # The MACD is calculated by subtracting the 26-period Exponential Moving Average (EMA) from the 12-period EMA.
        #  A nine-day EMA of the MACD called the "signal line," is then plotted on top of the MACD line,
        #  which can function as a trigger for buy and sell signals.
        #  Traders may buy the security when the MACD crosses above its signal line and sell - or short - the security
        #  when the MACD crosses below the signal line.
        #  Moving Average Convergence Divergence (MACD) indicators can be interpreted in several ways,
        #  but the more common methods are crossovers, divergences, and rapid rises/falls.
        signal_line = builder.exponential_moving_average(ta.macd_12_26, 9)
        ta_picked[
            'macd_12_26_signal'] = signal_line  # Relationship with signal line
        ta_picked['macd_12_26_diff_signal'] = (
            ta.macd_12_26 -
            signal_line).pct_change()  # Relationship with signal line
        ta_picked['macd_12_26_pct'] = ta.macd_12_26.pct_change(
        )  # Information about slope

        # PPO is identical to the moving average convergence divergence (MACD) indicator,
        # except the PPO measures percentage difference between two EMAs, while the MACD measures absolute (dollar) difference.
        signal_line = builder.exponential_moving_average(ta.ppo_12_26, 9)
        ta_picked[
            'ppo_12_26_signal'] = signal_line  # Relationship with signal line
        ta_picked['ppo_12_26_diff_signal'] = (
            ta.ppo_12_26 -
            signal_line).pct_change()  # Relationship with signal line
        ta_picked['ppo_12_26_pct'] = ta.ppo_12_26.pct_change(
        )  # Information about slope

        # ADI Accumulation/distribution is a cumulative indicator that uses volume and price to assess whether
        # a stock is being accumulated or distributed.
        # The accumulation/distribution measure seeks to identify divergences between the stock price and volume flow.
        # This provides insight into how strong a trend is. If the price is rising but the indicator is falling
        # this indicates that buying or accumulation volume may not be enough to support
        # the price rise and a price decline could be forthcoming.
        # ==> IDEA: if we can fit a line to the price y1 = m1X+q1 and a line to ADI y2=m2X+q2 then we can identify
        #           divergences by simply looking at the sign of M.
        #           Another insight would be given by the slope (ie pct_change)
        ta_picked['adi_pct'] = ta.adi.pct_change()
        ta_picked['adi_close_convergence'] = convergence_between_series(
            ta.adi, ohlcv.close, 3)

        # RSI goes from 0 to 100, values <= 20 mean BUY, while values >= 80 mean SELL.
        # Dividing it by 100 to get a floating point feature, makes no sense to pct_change it
        ta_picked['rsi_14_div100'] = ta.rsi_14 / 100

        # The Money Flow Index (MFI) is a technical indicator that generates overbought or oversold
        #   signals using both prices and volume data. The oscillator moves between 0 and 100.
        # An MFI reading above 80 is considered overbought and an MFI reading below 20 is considered oversold,
        #   although levels of 90 and 10 are also used as thresholds.
        # A divergence between the indicator and price is noteworthy. For example, if the indicator is rising while
        #   the price is falling or flat, the price could start rising.
        ta_picked['mfi_14_div100'] = ta.mfi_14 / 100

        # The Chande momentum oscillator is a technical momentum indicator similar to other momentum indicators
        #   such as Wilder’s Relative Strength Index (Wilder’s RSI) and the Stochastic Oscillator.
        #   It measures momentum on both up and down days and does not smooth results, triggering more frequent
        #   oversold and overbought penetrations. The indicator oscillates between +100 and -100.
        # Many technical traders add a 10-period moving average to this oscillator to act as a signal line.
        #   The oscillator generates a bullish signal when it crosses above the moving average and a
        #   bearish signal when it drops below the moving average.
        ta_picked['cmo_14_div100'] = ta.cmo_14 / 100
        signal_line = builder.simple_moving_average(ta.cmo_14, 10)
        ta_picked['cmo_14_signal'] = signal_line
        ta_picked['cmo_14_diff_signal'] = (ta.cmo_14 - signal_line) / 100

        # On-balance volume (OBV) is a technical trading momentum indicator that uses volume flow to predict changes in stock price.
        # Eventually, volume drives the price upward. At that point, larger investors begin to sell, and smaller investors begin buying.
        # Despite being plotted on a price chart and measured numerically,
        # the actual individual quantitative value of OBV is not relevant.
        # The indicator itself is cumulative, while the time interval remains fixed by a dedicated starting point,
        # meaning the real number value of OBV arbitrarily depends on the start date.
        # Instead, traders and analysts look to the nature of OBV movements over time;
        # the slope of the OBV line carries all of the weight of analysis. => We want percent change
        ta_picked['obv_pct'] = ta.obv.pct_change()
        ta_picked['obv_mean3_pct'] = ta.obv.rolling(3).mean().pct_change()

        # Strong rallies in price should see the force index rise.
        # During pullbacks and sideways movements, the force index will often fall because the volume
        # and/or the size of the price moves gets smaller.
        # => Encoding the percent variation could be a good idea
        ta_picked['fi_13_pct'] = ta.fi_13.pct_change()
        ta_picked['fi_50_pct'] = ta.fi_50.pct_change()

        # The Aroon Oscillator is a trend-following indicator that uses aspects of the
        # Aroon Indicator (Aroon Up and Aroon Down) to gauge the strength of a current trend
        # and the likelihood that it will continue.
        # It moves between -100 and 100. A high oscillator value is an indication of an uptrend
        # while a low oscillator value is an indication of a downtrend.
        ta_picked['ao_14'] = ta.ao_14 / 100

        # The average true range (ATR) is a technical analysis indicator that measures market volatility
        #   by decomposing the entire range of an asset price for that period.
        # ATRP is pct_change of volatility
        ta_picked['atrp_14'] = ta.atrp_14

        # Percentage Volume Oscillator (PVO) is momentum volume oscillator used in technical analysis
        #   to evaluate and measure volume surges and to compare trading volume to the average longer-term volume.
        # PVO does not analyze price and it is based solely on volume.
        #  It compares fast and slow volume moving averages by showing how short-term volume differs from
        #  the average volume over longer-term.
        #  Since it does not take the trend into account in its calculation (only volume data are used),
        #  this technical indicator cannot be used alone to predict changes in a trend.
        ta_picked['pvo_12_26'] = ta.pvo_12_26

        # IGNORED: tsi, wd, adx,

        #lagged_stats = pd.concat([ohlcv_stats] + [builder.make_lagged(ohlcv_stats, i) for i in range(1,10+1)], axis='columns', verify_integrity=True, sort=True, join='inner')

        # Build the dataframe with base features
        # lagged_close = pd.concat([ohlcv.close.pct_change()] + [builder.make_lagged(ohlcv.close.pct_change(), i) for i in range(1,10+1)], axis='columns', verify_integrity=True, sort=True, join='inner')
        # lagged_close.columns = ['close_pct'] + ['close_pct_lag-{}'.format(i) for i in range(1, W +1)]

        ohlc = ohlcv[['open', 'high', 'low', 'close', 'volume']].pct_change()
        ohlc.columns = ['{}_pct'.format(c) for c in ohlcv.columns]
        lagged_ohlc_pct = pd.concat(
            [ohlc] + [builder.make_lagged(ohlc, i) for i in range(1, W + 1)],
            axis='columns',
            verify_integrity=True,
            sort=True,
            join='inner')

        _time = pd.DataFrame(index=ohlcv.index)
        _time['day_of_year'] = ohlcv.index.dayofyear
        _time['day_of_week'] = ohlcv.index.dayofweek

        ohlc = ohlcv[['open', 'high', 'low', 'close', 'volume']]
        x_space = np.linspace(0, ohlc.index.size, ohlc.index.size)
        _splines = pd.DataFrame(index=ohlcv.index)

        # Highly correlated between themselves, no use
        # _splines['open_spl'] = get_spline(ohlc.open, 0)
        # _splines['high_spl'] = get_spline(ohlc.high, 0)
        # _splines['low_spl'] = get_spline(ohlc.low, 0)
        # _splines['close_spl'] = get_spline(ohlc.close, 0)

        _splines['open_spl_d1'] = builder.get_spline(ohlc.open, 1)
        _splines['high_spl_d1'] = builder.get_spline(ohlc.high, 1)
        _splines['low_spl_d1'] = builder.get_spline(ohlc.low, 1)
        _splines['close_spl_d1'] = builder.get_spline(ohlc.close, 1)

        _splines['open_spl_d2'] = builder.get_spline(ohlc.open, 2)
        _splines['high_spl_d2'] = builder.get_spline(ohlc.high, 2)
        _splines['low_spl_d2'] = builder.get_spline(ohlc.low, 2)
        _splines['close_spl_d2'] = builder.get_spline(ohlc.close, 2)

        _patterns = builder.get_talib_patterns(ohlcv)
        _new_features = pd.DataFrame(index=ohlcv.index)
        _new_features['candlestick_patterns_mean'] = _patterns.mean(axis=1)
        _new_features['candlestick_patterns_sum'] = _patterns.sum(axis=1)
        # WE LIKE THESE TWO!!!!
        _new_features['close_volatility_7d'] = ohlcv.close.pct_change(
        ).rolling(7).std(ddof=0)
        _new_features['close_volatility_30d'] = ohlcv.close.pct_change(
        ).rolling(30).std(ddof=0)
        #
        # Candle body size variation, for example
        _new_features['close_open_pct'] = (
            ohlcv.close - ohlcv.open
        ).pct_change()  # Change in body of the candle (> 0 if candle is green)
        _new_features['high_close_dist_pct'] = (
            ohlcv.high - ohlcv.close
        ).pct_change(
        )  # Change in wick size of the candle, shorter wick should be bullish
        _new_features['low_close_dist_pct'] = (
            ohlcv.close - ohlcv.low
        ).pct_change(
        )  # Change in shadow size of the candle, this increasing would indicate support (maybe a bounce)
        _new_features['high_low_dist_pct'] = (
            ohlcv.high - ohlcv.low
        ).pct_change(
        )  # Change in total candle size, smaller candles stands for low volatility

        for d in [3, 7, 30]:
            ohlcv_d[d].columns = ['close', 'high', 'low', 'open', 'volume']
            _new_features['close_open_pct_d{}'.format(d)] = (
                ohlcv_d[d].close - ohlcv_d[d].open).pct_change()
            _new_features['high_close_dist_pct_d{}'.format(d)] = (
                ohlcv_d[d].high - ohlcv_d[d].close).pct_change()
            _new_features['low_close_dist_pct_d{}'.format(d)] = (
                ohlcv_d[d].close - ohlcv_d[d].low).pct_change()
            _new_features['high_low_dist_pct_d{}'.format(d)] = (
                ohlcv_d[d].high - ohlcv_d[d].low).pct_change()

        _ta_windowed_features = pd.concat([
            v.rename(columns={c: '{}_ta{}d'.format(c, d)
                              for c in v.columns}) for d, v in ta_d.items()
        ],
                                          axis=1)
        # Add lagged features to the dataframe
        ta.columns = ['{}_ta1d'.format(c) for c in ta.columns]
        feature_groups = [
            _new_features, _splines, lagged_ohlc_pct, cm_picked, ta_picked,
            _ta_windowed_features, ta
        ]

        improved_df = pd.concat(feature_groups,
                                axis='columns',
                                verify_integrity=True,
                                sort=True,
                                join='inner')

        # Drop the first 30 rows
        improved_df = improved_df[30:]
        # Drop columns whose values are all nan or inf
        with pd.option_context('mode.use_inf_as_na',
                               True):  # Set option temporarily
            improved_df = improved_df.dropna(axis='columns', how='all')
        logger.info('Saving {}'.format(_sym))
        save_symbol_dataset(dest_index, _sym, improved_df, target=_target)
        logger.info('Saved {}'.format(_sym))
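
build() relies on several helpers from a builder module (make_lagged, get_spline, exponential_moving_average) whose implementations are not shown. As a rough point of reference only, a plausible stand-in for the lag helper (the project's actual implementation may differ) is a shift plus a column rename, so that several lags can be concatenated side by side with verify_integrity=True:

import pandas as pd

def make_lagged(frame, lag):
    # Shift every column down by `lag` rows and tag the names so that
    # concatenating several lags along axis='columns' keeps them unique.
    shifted = frame.shift(lag)
    shifted.columns = ["{}_lag-{}".format(c, lag) for c in frame.columns]
    return shifted

ohlc = pd.DataFrame({"close_pct": [0.10, -0.20, 0.05, 0.00]})
lagged = pd.concat([ohlc] + [make_lagged(ohlc, i) for i in range(1, 3)],
                   axis="columns", verify_integrity=True)
print(lagged)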
Example #35
def update_results():
    """Update the results table after a batch, cell or model selection
    is changed.

    """

    user = get_current_user()
    session = Session()
    nullselection = """
            MUST SELECT A BATCH AND ONE OR MORE CELLS AND
            ONE OR MORE MODELS BEFORE RESULTS WILL UPDATE
            """

    bSelected = request.args.get('bSelected')
    cSelected = request.args.getlist('cSelected[]')
    mSelected = request.args.getlist('mSelected[]')
    colSelected = request.args.getlist('colSelected[]')
    # If no batch, cell or model is selected, display an error message.
    if (len(bSelected) == 0) or (not cSelected) or (not mSelected):
        return jsonify(resultstable=nullselection)
    # Only get numerals for selected batch.
    bSelected = bSelected[:3]
    # Use default value of 500 if no row limit is specified.
    rowlimit = request.args.get('rowLimit', 500)
    ordSelected = request.args.get('ordSelected')
    # Parse string into appropriate sqlalchemy method
    if ordSelected == 'asc':
        ordSelected = asc
    elif ordSelected == 'desc':
        ordSelected = desc
    sortSelected = request.args.get('sortSelected', 'cellid')

    # Always add cellid and modelname to column lists,
    # since they are required for selection behavior.
    cols = [
        getattr(NarfResults, 'cellid'),
        getattr(NarfResults, 'modelname'),
    ]
    cols += [
        getattr(NarfResults, c) for c in colSelected
        if hasattr(NarfResults, c)
    ]

    # Package query results into a DataFrame
    results = psql.read_sql_query(
        Query(cols, session).filter(NarfResults.batch == bSelected).filter(
            NarfResults.cellid.in_(cSelected)).filter(
                NarfResults.modelname.in_(mSelected)).filter(
                    or_(
                        int(user.sec_lvl) == 9,
                        NarfResults.public == '1',
                        NarfResults.labgroup.ilike('%{0}%'.format(
                            user.labgroup)),
                        NarfResults.username == user.username,
                    )).order_by(ordSelected(getattr(
                        NarfResults, sortSelected))).limit(rowlimit).statement,
        session.bind)
    with pd.option_context('display.max_colwidth', -1):
        resultstable = results.to_html(
            index=False,
            classes="table-hover table-condensed",
        )

    session.close()

    return jsonify(resultstable=resultstable)
Example #36
    def test_config_default_off(self):
        df = pd.DataFrame({"A": [1, 2]})
        with pd.option_context("display.html.table_schema", False):
            result = df._repr_data_resource_()

        assert result is None
Example #37
def main():

    # Build new directories
    new_dir(args.path)
    export_path = args.path + "/export_conll"
    new_dir(export_path)
    import_path = args.path + "/import_conll"
    new_dir(import_path)

    StringProcessor = string_preprocessing.Preprocessor()

    # Get source text
    if args.txt:
        with open(args.txt) as f:
            raw_text = f.read()
        all_sentences = StringProcessor.process(raw_text)
    elif args.copy_beware_columns:
        all_sentences, all_conlls = conll_and_spacy.ConllSpacyUpdater.load_all_conlls(
            export_path)
    else:
        raise OSError("Either txt or conll must be given to know the text")

    if args.subdivision:
        all_sentences, subdivision_structure = StringProcessor.extract_subdivision_structure(
            args.subdivision, all_sentences)

    GrammarParser = parse_grammar.GrammarParser()
    ConllUpdater = conll_and_spacy.ConllSpacyUpdater(export_dir=export_path,
                                                     import_dir=import_path)

    # Fit spaCy parses with bad sentence chunking onto well-chunked nltk sentences and build the new conlls
    chunk_width = 10
    line_chunks, non_over_lapping_intervall = batch_splitter(all_sentences,
                                                             chunk_width,
                                                             overlap_margin=5)

    s_counter = count(0)  # Count generator for sentence indices
    j = next(s_counter)  # Index of sentence in the Corpus

    conll_df = pd.DataFrame()

    # Compute the parts of the corpus as blocks of sentence lists, ignore the margins for coref resolution, and handle corefs for each block.
    for i, chintervall in enumerate(
            list(zip(line_chunks, non_over_lapping_intervall))):
        corpus_index = chunk_width * i
        ch, intervall = chintervall

        chunk_text = " ".join(ch)
        spacy_neucoref_doc = GrammarParser.process(chunk_text)
        spacy_position = 0

        conll_dict = []

        for ch_j, sentence_from_chunk in enumerate(ch):

            tokens = StringProcessor.tokenize_text_to_words(
                sentence_from_chunk)
            start_token = tokens[0]
            start_pos = 0
            try:
                end_pos, last_token = next(
                    (i, t) for i, t in list(enumerate(tokens))[::-1]
                    if t not in ['.', '?', '!'])
            except StopIteration:
                logging.error(
                    "no last token here? '%s' for sentence no %d: '%s'" %
                    (str(tokens), j, str(sentence_from_chunk)))

            if '/' in last_token:
                last_token = last_token.split('/')[-1]
            sent_start = conll_and_spacy.find_position_in_doc_by_approx(
                spacy_neucoref_doc, start_token, spacy_position + start_pos)
            sent_end = conll_and_spacy.find_position_in_doc_by_approx(
                spacy_neucoref_doc, last_token, spacy_position + end_pos)

            if tokens[-1] in ['.', '?', '!']:
                dot = 1
            else:
                dot = 0
            sentence_from_spacy = spacy_neucoref_doc[sent_start:sent_end + 1 +
                                                     dot]

            spacy_position = sent_start + len(tokens)

            if ch_j not in range(*intervall):
                continue

            if args.copy_beware_columns:
                ConllUpdater.conll_over_spacy(sentence_from_spacy,
                                              import_path,
                                              j,
                                              no_cols=args.copy_beware_columns)

            conll_dict.extend(
                ConllUpdater.export_dict(sentence_from_spacy, index=j))
            j = next(s_counter)

        single_chunk_conll_df = pd.DataFrame(conll_dict)
        ConllUpdater.annotate_corefs(spacy_neucoref_doc, single_chunk_conll_df)
        # Update the df with the coref annotations for the whole doc block, because the coref annotations are incomplete when queried per token.

        #single_chunk_conll_df = single_chunk_conll_df.iloc[range(*intervall)]
        conll_df = conll_df.append(single_chunk_conll_df, ignore_index=True)

    # A groupby df sometimes doesn't contain the column it is grouped by, so copy it.
    conll_df['sent_id'] = conll_df['s_id']

    # Write all the conll files
    conll_df.groupby(
        ['s_id']).apply(lambda x: ConllUpdater.write_conll_by_df_group(x))

    with open(export_path + "/lemmas.txt", 'w+') as f:
        f.write(" ".join(conll_df['lemma'].tolist()))

    with open(export_path + "/subdivision.txt", 'w+') as f:
        f.write(str(subdivision_structure))

    test_fun = test_equality_of_sentences(all_sentences)
    test_df = conll_df.groupby(['sent_id']).apply(lambda x: test_fun(x))
    with pd.option_context('display.max_rows', None, 'display.max_columns',
                           None):
        print(test_df)

    return 0
Example #38
                   to_replace=unit, value="Year")
    else:
        continue
# Filling missing values with median
df['pw_amount_9089'] = df['pw_amount_9089'].fillna(
    (df['pw_amount_9089'].median()))
# Changing format from string to float
df['pw_amount_9089'] = df.pw_amount_9089.astype(float)
# Displaying the first 10 values
df[['pw_amount_9089', 'pw_unit_of_pay_9089']].head(10)

# In[31]:

# Since running "describe" method on "pw_amount_9089" column returned exponential values, I decided to
# convert them to floats so that they are easier to understand
with pd.option_context('float_format', '{:.2f}'.format):
    print(df.pw_amount_9089.describe())

# In[32]:

#Dividing our continuous income values into some categories to facilitate their visualization\n",
df['remuneration'] = pd.cut(df['pw_amount_9089'], [
    0, 30000, 60000, 90000, 120000, 150000, 180000, 210000, 240000, 270000,
    495748000
],
                            right=False,
                            labels=[
                                "0-30k", "30-60k", "60-90k", "90-120k",
                                "120-150k", "150-180k", "180-210k", "210-240k",
                                "240-270k", "270k+"
                            ])
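For readers unfamiliar with pd.cut, a small illustrative sketch with made-up sample values: right=False makes every bin half-open [a, b), so a value of exactly 30000 falls into the "30-60k" bucket rather than "0-30k".

import pandas as pd

# Illustrative only: three made-up salaries binned with the same right=False
# convention as above.
sample = pd.Series([15000, 30000, 99000])
print(pd.cut(sample,
             [0, 30000, 60000, 90000, 120000],
             right=False,
             labels=["0-30k", "30-60k", "60-90k", "90-120k"]))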
示例#39
0
 def test_repr_max_rows(self):
     # GH 6863
     with pd.option_context('max_rows', None):
         str(Series(range(1001)))  # should not raise exception
示例#40
0
def test_repr_with_unicode_data():
    with pd.option_context("display.encoding", "UTF-8"):
        d = {"a": ["\u05d0", 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}
        index = pd.DataFrame(d).set_index(["a", "b"]).index
        assert "\\" not in repr(index)  # we don't want unicode-escaped
示例#41
0
def show_matrix(m):
    with pd.option_context('display.float_format', lambda x: "%g" % x):
        display(pd.DataFrame(m))
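A hedged usage sketch for the helper above: display.float_format expects a callable that maps a float to a string, and "%g" trims trailing zeros while keeping tiny values compact. display() comes from IPython, so outside a notebook print() gives the same formatting effect:

import numpy as np
import pandas as pd

m = np.array([[1.0, 0.25], [0.333333, 1e-7]])
with pd.option_context('display.float_format', lambda x: "%g" % x):
    print(pd.DataFrame(m))  # 0.25 instead of 0.250000, 1e-07 instead of 1.000000e-07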
示例#42
0
            movies_links.append(wiki_prefix + link.get('href'))

#import pandas to convert list to data frame
import pandas as pd

df = pd.DataFrame(A, columns=['Title'])
df['Year'] = B
df['Role'] = C
df['Director'] = D
df['Links'] = movies_links

df.sort_values("Year")

question_1 = df.copy().drop("Links", axis=1)

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(question_1)

###### Question 2
#Get all actors from all movies:

# for movie_link in df["Links"]:
#     if movie_link != "":
#         page = urlopen(movie_link)
#         soup = BeautifulSoup(page, features="html.parser")
#         for elem in soup.find_all("h2", text=re.compile(r'[cast|Cast]')):
#             print (elem)
#         # nu = soup.findAll('h2', re.compile("(cast|Cast)"))
#         # print (nu)

actors_list = []
示例#43
0
        trade_size=Decimal(1_000_000),
        order_id_tag="001",
    )
    # Instantiate and add your strategy
    strategy = EMACrossBracket(config=config)
    engine.add_strategy(strategy=strategy)

    input("Press Enter to continue...")  # noqa (always Python 3)

    # Run the engine (from start to end of data)
    engine.run()

    # Optionally view reports
    with pd.option_context(
            "display.max_rows",
            100,
            "display.max_columns",
            None,
            "display.width",
            300,
    ):
        print(engine.trader.generate_account_report(SIM))
        print(engine.trader.generate_order_fills_report())
        print(engine.trader.generate_positions_report())

    # For repeated backtest runs make sure to reset the engine
    engine.reset()

    # Good practice to dispose of the object when done
    engine.dispose()
示例#44
0
    def test_summary(self, file_name=None):
        self.assert_all_tested()

        columns = [{
            "title": "name",
            "key": "name",
            "type": str
        }, {
            "title": "mean costs",
            "key": "test_mean_costs",
            "type": float
        }, {
            "title": "variance costs",
            "key": "test_variance_costs",
            "type": float
        }, {
            "title": "mean wealth",
            "key": "test_mean_wealth",
            "type": float
        }, {
            "title": "variance wealth",
            "key": "test_variance_wealth",
            "type": float
        }, {
            "title": "mean abs wealth w. price",
            "key": "test_wealth_with_price_abs_mean",
            "type": float
        }, {
            "title": "variance wealth w. price",
            "key": "test_wealth_with_price_variance",
            "type": float
        }, {
            "title": "risk (train)",
            "key": "train_risk",
            "type": float
        }, {
            "title": "risk (test)",
            "key": "test_risk",
            "type": float
        }, {
            "title": "price",
            "key": "price",
            "type": float
        }, {
            "title": "training time",
            "key": "train_time",
            "type": float
        }, {
            "title": "trainable vars",
            "key": "trainable_variables",
            "type": int
        }, {
            "title": "non-trainable vars",
            "key": "non_trainable_variables",
            "type": int
        }]

        dictionary = {}
        for col in columns:
            dictionary[col["title"]] = [col["type"](case[col["key"]]) \
                                        for case in self.testcases]

        df = pd.DataFrame(dictionary)

        appendum = "\n\n" + f"payoff: {self.test_mean_payoff: .6f}"

        if file_name is not None:
            df.to_csv(file_name, index=False)

            with open(file_name, "a") as file:
                file.write(appendum)

        with pd.option_context('display.max_columns', None):
            print(df)
示例#45
0
gnb.fit(X_train, categoria_encoded)

######################################### TEST  #########################################

X_counts2 = vectorizer.transform(df['Des_limpio']).toarray()
predicted = gnb.predict(X_counts2)

print("\n")
print(
    "######################################### Input data: ######################################### "
    + "\n")
print(df[['Descripción', 'Cargos (CLP)', 'Abonos (CLP)', 'Saldo (CLP)']])

# print("######################################### Los cargos de entrada son: ######################################### " + "\n")
print("\n")
print(
    "######################################### Resultado del modelo: ######################################### "
    + "\n")
for numero, x in enumerate(predicted):
    predicciones.append(list(le.classes_)[x])

df = df.assign(Categoria=predicciones)

with pd.option_context('display.max_rows', None):
    print(df[[
        'Descripción', 'Cargos (CLP)', 'Abonos (CLP)', 'Saldo (CLP)',
        'Categoria'
    ]])

#"""
示例#46
0
    def evaluate(self, save_as=None):
        if save_as:
            log = open(logs_save_path + "{}.log".format(save_as), "w")
        else:
            log = open("temp_{}.log".format(time.ctime()), "w")

        ystd_score = sum([
            self.portfolio.scores.get(s, 0) *
            self.portfolio.weights_ystd.get(s, 0)
            for s in self.portfolio.stock_pool
        ])

        tdy_score = sum([
            self.portfolio.scores.get(s, 0) * self.hold_weights.get(s, 0)
            for s in self.hold_weights
        ])

        print()
        print("[Before Adjust] Score: {:.6f}    Percentile: {:.4f}".format(
            ystd_score, self.score_rank(ystd_score)))
        print("[~After Adjust] Score: {:.6f}    Percentile: {:.4f}".format(
            tdy_score, self.score_rank(tdy_score)))

        self.ystd_score_rank = self.score_rank(ystd_score)
        self.tdy_score_rank = self.score_rank(tdy_score)
        #
        print(
            "[Before Adjust] Score: {:.6f}    Percentile: {:.4f}".format(
                ystd_score, self.score_rank(ystd_score)),
            file=log,
        )
        print(
            "[~After Adjust] Score: {:.6f}    Percentile: {:.4f}".format(
                tdy_score, self.score_rank(tdy_score)),
            file=log,
        )

        ysd_holding = {
            k: v
            for k, v in self.portfolio.weights_ystd.items() if v > 0
        }
        tdy_holding = {
            int(v.name.split("_")[1]): v.varValue
            for v in self.solver.variables() if v.varValue
        }
        stock_out = set(ysd_holding) - set(tdy_holding)
        stock_in = set(tdy_holding) - set(ysd_holding)
        adjust = set(tdy_holding) & set(ysd_holding)

        adjust = [
            s for s in adjust if abs(
                self.portfolio.weights_ystd.get(s, 0) -
                self.hold_weights.get(s, 0)) > 1e-6
        ]

        print("\nIn:{}\t Out:{}\t Adjust:{}\t".format(len(stock_in),
                                                      len(stock_out),
                                                      len(adjust)))
        print(
            "\nIn:{}\t Out:{}\t Adjust:{}\t".format(len(stock_in),
                                                    len(stock_out),
                                                    len(adjust)),
            file=log,
        )

        buyin_tvr = sum([self.hold_weights.get(s, 0) for s in stock_in])
        adj_tvr = sum([
            max(
                self.hold_weights.get(s, 0) -
                self.portfolio.weights_ystd.get(s, 0),
                0,
            ) for s in adjust
        ])

        print(
            "\n>> Turnover:    \n Buy in {:7.4f}   Adjust {:.4f}   Total {:.4f}"
            .format(buyin_tvr, adj_tvr, buyin_tvr + adj_tvr))

        print(
            "\n>> Turnover:    \n Buy in {:7.4f}   Adjust {:.4f}   Total {:.4f}"
            .format(buyin_tvr, adj_tvr, buyin_tvr + adj_tvr),
            file=log,
        )

        self.total_tvr = buyin_tvr + adj_tvr

        tdy_values, ystd_values = {}, {}

        for var in self.portfolio.continuous:
            tdy_values[var] = sum([
                getattr(self.portfolio, var)[s] * self.hold_weights.get(s, 0)
                for s in self.hold_weights
            ])
            ystd_values[var] = sum([
                getattr(self.portfolio, var)[s] *
                self.portfolio.weights_ystd.get(s, 0)
                for s in self.portfolio.stock_pool
            ])

            print(
                "\n>> {}:\n Before {:7.4f}   After {:.4f}   Target {:.4f}   Δ {:7.4f} "
                .format(
                    var,
                    ystd_values[var],
                    tdy_values[var],
                    getattr(self.portfolio, "{}_constraint".format(var)),
                    tdy_values[var] -
                    getattr(self.portfolio, "{}_constraint".format(var)),
                ))

            print(
                "\n>> {}:\n Before {:7.4f}   After {:.4f}   Target {:.4f}   Δ {:7.4f} "
                .format(
                    var,
                    ystd_values[var],
                    tdy_values[var],
                    getattr(self.portfolio, "{}_constraint".format(var)),
                    tdy_values[var] -
                    getattr(self.portfolio, "{}_constraint".format(var)),
                ),
                file=log,
            )

        df_info = {}
        for var in self.portfolio.descrete:
            df_info[var] = pd.DataFrame(
                {"before": getattr(self.portfolio, "{}_weights".format(var))})

            after = {
                sc: sum([
                    w * self.is_this_category(s, sc, var)
                    for s, w in self.hold_weights.items()
                ])
                for sc in getattr(self.portfolio, "{}_list".format(var))
            }

            df_info[var]["after"] = [after.get(x) for x in df_info[var].index]

            df_info[var]["target"] = [
                getattr(self.portfolio, "{}_constraint".format(var)).get(x)
                for x in df_info[var].index
            ]

            df_info[var]["Δ"] = df_info[var]["after"] - df_info[var]["target"]

            with pd.option_context("display.max_rows", 8):
                print(
                    "\n>> {}:\n\n".format(var),
                    df_info[var].sort_values(by="Δ", ascending=False),
                )
                print(
                    "\n>> {}:\n\n".format(var),
                    df_info[var].sort_values(by="Δ", ascending=False),
                    file=log,
                )

        print("\n\nNew buy in:\n", file=log)
        for i, s in enumerate(stock_in):
            print(
                "  {:0>6}   {}% -> {:.2f}% {}".format(
                    s,
                    0,
                    100 * self.hold_weights.get(s, 0),
                    [" ", "*"][s in self.reach_max],
                ),
                end=["    |   ", "\n"][(i + 1) % 3 == 0],
                file=log,
            )

        print("\n\nSell:\n", file=log)
        for i, s in enumerate(stock_out):
            print(
                "  {:0>6}   {:.2f}% -> {}%".format(
                    s, 100 * self.portfolio.weights_ystd.get(s, 0), 0),
                end=["      |   ", "\n"][(i + 1) % 3 == 0],
                file=log,
            )

        print("\n\nAdjust:\n", file=log)

        for i, s in enumerate(adjust):
            print(
                "  {:0>6}   {:.2f}% -> {:.2f}%".format(
                    s,
                    100 * self.portfolio.weights_ystd.get(s, 0),
                    100 * self.hold_weights.get(s, 0),
                ),
                end=["    | ", "\n"][(i + 1) % 3 == 0],
                file=log,
            )

        self.result = pd.Series(
            {s: self.hold_weights.get(s, 0)
             for s in tdy_holding},
            name="weights")

        self.result.index.name = "Symbol"

        if save_as:
            self.result.sort_index().to_csv(weights_save_path +
                                            "{}.csv".format(save_as))

        log.close()

        return self.result.sort_index()
示例#47
0
def printpd(o):
    with pd.option_context('display.max_rows', None, 'display.max_columns',
                           None):  # more options can be specified also
        print(o)
示例#48
0
    def test_detect_chained_assignment_warnings(self):
        with option_context("chained_assignment", "warn"):
            df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]})

            with tm.assert_produces_warning(com.SettingWithCopyWarning):
                df.loc[0]["A"] = 111
示例#49
0
def display(
    frame: pd.DataFrame,
    order: List[str],
    guess: Optional[np.ndarray] = None,
    mode: str = "default",
    decimal: int = 2,
) -> None:
    """ Display the frame or guess through IPython.
    :param frame:
        A pd.DataFrame
    :param order:
        A permutation of ["phi", "r", "z"]
    :param guess:
        A prediction probability matrix.
    :param mode:
        One of ["default", "guess", "pairs", "discrete pairs"].
        If "pairs" or "discrete pairs", the answer is displayed in the same
        cell as the "guess" prediction. Format: "`ANSWER`[PREDICTION]"
    :param decimal:
        How many decimals places to round the guesses to.
    :return:
        None.
    """
    table = pd.DataFrame(ext.extract_input(frame, order), columns=order)
    target = ext.extract_output(frame, order).round(0)
    if target.shape[1] > 1:
        column = [chr(65 + i) for i in range(target.shape[1] - 2)]
        noise = frame["noise"].any()
        padding = frame["padding"].any()
        column.append("noise" if noise else chr(65 + target.shape[1] - 2))
        column.append("padding" if padding else chr(65 + target.shape[1] - 1))
    else:
        column = [chr(65)]
    if mode == "guess":
        out_table = pd.DataFrame(data=guess, columns=column).replace(0, "")
        table = pd.concat([table, out_table], axis=1)
    elif mode == "discrete pairs":
        guess = metrics.discrete(guess).round(0)
        data = []
        for x in range(len(guess)):
            row = []
            for y in range(len(guess[x])):
                if target[x, y] == 0 and guess[x, y] == 0:
                    row.append("")
                else:
                    t, g = int(target[x, y]), int(guess[x, y])
                    row.append("`{0}`[{1}]".format(t, g))
            data.append(row)
        out_table = pd.DataFrame(data=data, columns=column)
        table = pd.concat([table, out_table], axis=1)
    elif mode == "pairs" and guess is not None:
        guess = guess.round(decimal)
        data = []
        for x in range(len(guess)):
            row = []
            for y in range(len(guess[x])):
                if target[x, y] == 0 and guess[x, y] == 0:
                    row.append("")
                else:
                    t, g = int(target[x, y]), np.round(guess[x, y], 2)
                    row.append("`{0}`[{1}]".format(t, g))
            data.append(row)
        out_table = pd.DataFrame(data=data, columns=column)
        table = pd.concat([table, out_table], axis=1)
    else:
        out_table = pd.DataFrame(data=target, columns=column).replace(0, "")
        table = pd.concat([table, out_table], axis=1)
    with pd.option_context('display.max_columns', 0):
        IPython.display.display(table)
示例#50
0
def test_repr_max_seq_item_setting(idx):
    # GH10182
    idx = idx.repeat(50)
    with pd.option_context("display.max_seq_items", None):
        repr(idx)
        assert "..." not in str(idx)
示例#51
0
def main():

    name = 'Sioux Falls Network'

    ########################
    # bootstrap parameters #
    ########################
    boot = 5

    #################################
    # initialize discrete event env #
    #################################
    env = simpy.Environment()  # use instant simulation
    # env = simpy.rt.RealtimeEnvironment(factor=1.)  # use real time simulation

    # setup simulation processes
    bsProcess = []
    for b in range(boot):
        bs = Bootstrap(env)
        env.process(bs.processSimulation())
        bsProcess.append(bs)

    start_time = timeit.default_timer() # start simulation timer

    env.run()

    end_time = timeit.default_timer() # end simulation timer

    # compile simulation statistics
    bsTable = None
    for n, bootstrap in enumerate(bsProcess):
        df = pd.DataFrame(sorted(bootstrap.sim.data, key=lambda x: x[3]),
                          columns=['carID', 'link', 'event',
                                   'time', 'queue', 't_queue'])

        meanQlength = df.loc[df['event'] == 'departure'][
            ['link', 'queue']].groupby(['link']).mean()
        meanQlength.columns=['mean']

        varQlength = df.loc[df['event'] == 'departure'][
            ['link', 'queue']].groupby(['link']).var()
        varQlength.columns=['variance']

        maxQlength = df.loc[df['event'] == 'departure'][
            ['link', 'queue']].groupby(['link']).max()
        maxQlength.columns=['max']

        if bsTable is None:
            bsTable = maxQlength
            bsTable.columns = [1]

        else:
            bsTable[n+1] = maxQlength

    mean = bsTable.mean(axis=1)
    mse = bsTable.var(axis=1, ddof=0)
    bsTable['mean'] = mean
    bsTable['MSE'] = mse

    print('Simulation runtime: %.3fs' % (end_time-start_time))

    with pd.option_context('expand_frame_repr', False):
        print(bsTable)
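For context on the option used just above: 'expand_frame_repr' resolves to 'display.expand_frame_repr'. When it is False, a wide frame's repr is not wrapped across several column "pages"; columns that do not fit display.width are truncated instead. A small self-contained sketch:

import numpy as np
import pandas as pd

# 25 columns: with the option off, the repr stays in a single block instead of
# wrapping across multiple lines of column groups.
wide = pd.DataFrame(np.random.rand(3, 25))
with pd.option_context('display.expand_frame_repr', False):
    print(wide)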
示例#52
0
def long_short_trend(transaction_cost=0.0000, plot=False):
    start_bt_date_1yr_plus = '2014-09-13'
    start_bt_date = '2015-09-13'
    end_bt_date = '2020-09-13'
    df, stoxx600 = get_df_stoxx600(
        start_bt_date_1yr_plus=start_bt_date_1yr_plus, end_bt_date=end_bt_date)
    # df, stoxx600 = get_df_sp500(start_bt_date_1yr_plus=start_bt_date_1yr_plus, end_bt_date=end_bt_date)

    ## short index

    port_ret = stoxx600.loc[start_bt_date:end_bt_date].pct_change().rename(
        'short_index').to_frame()

    ## sides return
    port_ret['long'] = trend_trading(df,
                                     start_bt_date,
                                     end_bt_date,
                                     sigs=(20, 50, 150),
                                     transaction_cost=0)
    port_ret['short_trend'] = trend_trading(df,
                                            start_bt_date,
                                            end_bt_date,
                                            sigs=(20, 50, 150),
                                            transaction_cost=0,
                                            direction='short')
    port_ret = port_ret.fillna(0)

    ## short trend side

    plt.rcParams["figure.dpi"] = 800

    with pd.option_context('display.max_rows', None, 'display.max_columns',
                           None):
        print(port_ret['short_trend'].sort_values(ascending=False))
    (-port_ret['short_trend'].fillna(0) + 1).cumprod().plot(
        figsize=(5, 3.5), legend=True, lw=0.75, fontsize=10).legend(loc=2)
    plt.ylabel('Cumulative Return')
    plt.savefig("short_trend_follow/short_side_pnl.png")
    plt.close()

    ## ew_eq_curves
    eq_eq_curves = (port_ret + 1).cumprod()

    plt.rcParams["figure.dpi"] = 800

    eq_eq_curves.plot(figsize=(5, 3.5), legend=True, lw=0.75,
                      fontsize=10).legend(loc=2)
    plt.ylabel('Cumulative Return')
    plt.savefig("short_trend_follow/pnls")
    plt.close()

    ## combined Pnl

    plt.rcParams["figure.dpi"] = 800
    combined_PnL = (
        port_ret['long'] -
        port_ret['short_trend']).rename('long_short_trend').to_frame()
    combined_PnL['benchmark'] = (port_ret['long'] - port_ret['short_index'])
    combined_PnL = (combined_PnL.fillna(0) + 1).cumprod()
    combined_PnL.plot(figsize=(5, 3.5), legend=True, lw=0.75,
                      fontsize=10).legend(loc=2)
    plt.ylabel('Cumulative Return')
    plt.savefig("short_trend_follow/combined_pnl")
    plt.close()

    # eq_eq_curves['long'] = (port_ret['long'] +1).cumprod()
    # eq_eq_curves['short'] = (port_ret['short'] +1).cumprod()
    ## performance_analysis

    performance_analysis(eq_eq_curves['long'], eq_eq_curves['short_trend'])

    ## portfolio analysis

    return pd.DataFrame({
        'benchmark':
        port_stats(eq_eq_curves['long'], eq_eq_curves['short_index']),
        'long_short_trend':
        port_stats(eq_eq_curves['long'], eq_eq_curves['short_trend'])
    })
示例#53
0
    def check(
        self,
        data_frame,
        basename=None,
        fullpath=None,
        tolerances=None,
        default_tolerance=None,
    ):
        """
        Checks the given pandas dataframe against a previously recorded version, or generate a new file.

        Example::

            data_frame = pandas.DataFrame.from_dict({
                'U_gas': U[0][positions],
                'U_liquid': U[1][positions],
                'gas_vol_frac [-]': vol_frac[0][positions],
                'liquid_vol_frac [-]': vol_frac[1][positions],
                'P': Pa_to_bar(P)[positions],
            })
            dataframe_regression.check(data_frame)

        :param pandas.DataFrame data_frame: pandas DataFrame containing data for regression check.

        :param str basename: basename of the file to test/record. If not given the name
            of the test is used.

        :param str fullpath: complete path to use as a reference file. This option
            will ignore embed_data completely, being useful if a reference file is located
            in the session data dir for example.

        :param dict tolerances: dict mapping keys from the data_dict to tolerance settings for the
            given data. Example::

                tolerances={'U': Tolerance(atol=1e-2)}

        :param dict default_tolerance: dict mapping the default tolerance for the current check
            call. Example::

                default_tolerance=dict(atol=1e-7, rtol=1e-18).

            If not provided, will use defaults from numpy's ``isclose`` function.

        ``basename`` and ``fullpath`` are exclusive.
        """
        try:
            import pandas as pd
        except ModuleNotFoundError:
            raise ModuleNotFoundError(import_error_message("Pandas"))

        import functools

        __tracebackhide__ = True

        assert type(data_frame) is pd.DataFrame, (
            "Only pandas DataFrames are supported on the dataframe_regression fixture.\n"
            "Object with type '%s' was given." % (str(type(data_frame)), ))

        for column in data_frame.columns:
            array = data_frame[column]
            # Skip assertion if an array of strings
            if (array.dtype == "O") and (type(array[0]) is str):
                continue
            # Rejected: timedelta, datetime, objects, zero-terminated bytes, unicode strings and raw data
            assert array.dtype not in [
                "m", "M", "O", "S", "a", "U", "V"
            ], ("Only numeric data is supported on dataframe_regression fixture.\n"
                "Array with type '%s' was given.\n" % (str(array.dtype), ))

        if tolerances is None:
            tolerances = {}
        self._tolerances_dict = tolerances

        if default_tolerance is None:
            default_tolerance = {}
        self._default_tolerance = default_tolerance

        dump_fn = functools.partial(self._dump_fn, data_frame)

        with pd.option_context(*self._pandas_display_options):
            perform_regression_check(
                datadir=self.datadir,
                original_datadir=self.original_datadir,
                request=self.request,
                check_fn=self._check_fn,
                dump_fn=dump_fn,
                extension=".csv",
                basename=basename,
                fullpath=fullpath,
                force_regen=self._force_regen,
            )
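A hedged usage sketch for a fixture like the one above (the names mirror pytest-regressions' dataframe_regression; the test name and frame contents are made up):

import pandas


def test_pressure_profile(dataframe_regression):
    # On the first run the frame is recorded as a CSV; later runs compare
    # against that file within the given tolerances.
    frame = pandas.DataFrame({"U_gas": [1.0, 2.0], "P": [1.013, 2.026]})
    dataframe_regression.check(
        frame,
        default_tolerance=dict(atol=1e-8, rtol=1e-8),
    )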
示例#54
0
pd.read_csv('https://github.com/JamesByers/Datasets/raw/master/drinks.csv',
            nrows=10)  # only read first 10 rows
pd.read_csv('https://github.com/JamesByers/Datasets/raw/master/drinks.csv',
            skiprows=[1, 2])  # skip the first two rows of data

# write a DataFrame out to a CSV
drinks.to_csv('drinks_updated.csv')  # index is used as first column
drinks.to_csv('drinks_updated.csv', index=False)  # ignore index

# save a DataFrame to disk (aka 'pickle') and read it from disk (aka 'unpickle')
drinks.to_pickle('drinks_pickle')
pd.read_pickle('drinks_pickle')

# randomly sample a DataFrame
train = drinks.sample(frac=0.75,
                      random_state=1)  # will contain 75% of the rows
test = drinks[~drinks.index.isin(train.index)]  # will contain the other 25%

# change the maximum number of rows and columns printed ('None' means unlimited)
pd.set_option('max_rows', None)  # default is 60 rows
pd.set_option('max_columns', None)  # default is 20 columns
print(drinks)

# reset options to defaults
pd.reset_option('max_rows')
pd.reset_option('max_columns')

# change the options temporarily (settings are restored when you exit the 'with' block)
with pd.option_context('max_rows', None, 'max_columns', None):
    print(drinks)
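The bare option names above ('max_rows', 'max_columns') rely on pandas' substring matching of option keys; the fully qualified names do the same thing and read less ambiguously. A sketch of the equivalent call for the block above:

# equivalent to the block above, with fully qualified option names
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(drinks)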
示例#55
0
from generate_counterfactuals import generate_counterfactuals
from search_utils.Query import Query
from search_utils.Sentence import Sentence

model_config.load("imdb", evalution_model="gpt2")

num = 5
result = []
for wanted_positivity in range(num + 1):
    wanted_positivity = wanted_positivity / num
    wanted_cls = [(1 - wanted_positivity), wanted_positivity]
    max_delta = 50. / num / 100.
    print(f"{wanted_cls[1]}+-{max_delta}")
    # relatively high consider_max_words because max_delta is small.
    # sent = "A decent story with some thrilling action scenes."
    # sent = "the year's best and most unpredictable comedy."
    sent = "an extremely unpleasant film."
    r = generate_counterfactuals(
        sent, Query(wanted_cls=wanted_cls, max_delta=max_delta))
    print(r.examples[0][0] if len(r.examples) > 0 else "----")
    result.append({
        "y'": f"{wanted_cls[1]:.1f} pm {max_delta:.1f}",
        "y": f"{r.examples[0][0].cls[1]:.2f}",
        "Counterfactual Example x' ": r.examples[0][0].sentence
    })

print("######")
print(f"Original cls {Sentence(sent).calc_sentiment()[1]}")
with pd.option_context("max_colwidth", 1000):
    print(pd.DataFrame(result).to_latex(index=False))
示例#56
0
    def test_dt_namespace_accessor(self):

        # GH 7207, 11128
        # test .dt namespace accessor

        ok_for_base = [
            'year', 'month', 'day', 'hour', 'minute', 'second', 'weekofyear',
            'week', 'dayofweek', 'weekday', 'dayofyear', 'quarter', 'freq',
            'days_in_month', 'daysinmonth', 'is_leap_year'
        ]
        ok_for_period = ok_for_base + ['qyear', 'start_time', 'end_time']
        ok_for_period_methods = ['strftime', 'to_timestamp', 'asfreq']
        ok_for_dt = ok_for_base + [
            'date', 'time', 'microsecond', 'nanosecond', 'is_month_start',
            'is_month_end', 'is_quarter_start', 'is_quarter_end',
            'is_year_start', 'is_year_end', 'tz', 'weekday_name'
        ]
        ok_for_dt_methods = [
            'to_period', 'to_pydatetime', 'tz_localize', 'tz_convert',
            'normalize', 'strftime', 'round', 'floor', 'ceil', 'weekday_name'
        ]
        ok_for_td = ['days', 'seconds', 'microseconds', 'nanoseconds']
        ok_for_td_methods = [
            'components', 'to_pytimedelta', 'total_seconds', 'round', 'floor',
            'ceil'
        ]

        def get_expected(s, name):
            result = getattr(Index(s._values), prop)
            if isinstance(result, np.ndarray):
                if is_integer_dtype(result):
                    result = result.astype('int64')
            elif not is_list_like(result):
                return result
            return Series(result, index=s.index, name=s.name)

        def compare(s, name):
            a = getattr(s.dt, prop)
            b = get_expected(s, prop)
            if not (is_list_like(a) and is_list_like(b)):
                self.assertEqual(a, b)
            else:
                tm.assert_series_equal(a, b)

        # datetimeindex
        cases = [
            Series(date_range('20130101', periods=5), name='xxx'),
            Series(date_range('20130101', periods=5, freq='s'), name='xxx'),
            Series(date_range('20130101 00:00:00', periods=5, freq='ms'),
                   name='xxx')
        ]
        for s in cases:
            for prop in ok_for_dt:
                # we test freq below
                if prop != 'freq':
                    compare(s, prop)

            for prop in ok_for_dt_methods:
                getattr(s.dt, prop)

            result = s.dt.to_pydatetime()
            self.assertIsInstance(result, np.ndarray)
            self.assertTrue(result.dtype == object)

            result = s.dt.tz_localize('US/Eastern')
            exp_values = DatetimeIndex(s.values).tz_localize('US/Eastern')
            expected = Series(exp_values, index=s.index, name='xxx')
            tm.assert_series_equal(result, expected)

            tz_result = result.dt.tz
            self.assertEqual(str(tz_result), 'US/Eastern')
            freq_result = s.dt.freq
            self.assertEqual(freq_result,
                             DatetimeIndex(s.values, freq='infer').freq)

            # let's localize, then convert
            result = s.dt.tz_localize('UTC').dt.tz_convert('US/Eastern')
            exp_values = (DatetimeIndex(
                s.values).tz_localize('UTC').tz_convert('US/Eastern'))
            expected = Series(exp_values, index=s.index, name='xxx')
            tm.assert_series_equal(result, expected)

        # round
        s = Series(pd.to_datetime([
            '2012-01-01 13:00:00', '2012-01-01 12:01:00', '2012-01-01 08:00:00'
        ]),
                   name='xxx')
        result = s.dt.round('D')
        expected = Series(pd.to_datetime(
            ['2012-01-02', '2012-01-02', '2012-01-01']),
                          name='xxx')
        tm.assert_series_equal(result, expected)

        # round with tz
        result = (
            s.dt.tz_localize('UTC').dt.tz_convert('US/Eastern').dt.round('D'))
        exp_values = pd.to_datetime(['2012-01-01', '2012-01-01',
                                     '2012-01-01']).tz_localize('US/Eastern')
        expected = Series(exp_values, name='xxx')
        tm.assert_series_equal(result, expected)

        # floor
        s = Series(pd.to_datetime([
            '2012-01-01 13:00:00', '2012-01-01 12:01:00', '2012-01-01 08:00:00'
        ]),
                   name='xxx')
        result = s.dt.floor('D')
        expected = Series(pd.to_datetime(
            ['2012-01-01', '2012-01-01', '2012-01-01']),
                          name='xxx')
        tm.assert_series_equal(result, expected)

        # ceil
        s = Series(pd.to_datetime([
            '2012-01-01 13:00:00', '2012-01-01 12:01:00', '2012-01-01 08:00:00'
        ]),
                   name='xxx')
        result = s.dt.ceil('D')
        expected = Series(pd.to_datetime(
            ['2012-01-02', '2012-01-02', '2012-01-02']),
                          name='xxx')
        tm.assert_series_equal(result, expected)

        # datetimeindex with tz
        s = Series(date_range('20130101', periods=5, tz='US/Eastern'),
                   name='xxx')
        for prop in ok_for_dt:

            # we test freq below
            if prop != 'freq':
                compare(s, prop)

        for prop in ok_for_dt_methods:
            getattr(s.dt, prop)

        result = s.dt.to_pydatetime()
        self.assertIsInstance(result, np.ndarray)
        self.assertTrue(result.dtype == object)

        result = s.dt.tz_convert('CET')
        expected = Series(s._values.tz_convert('CET'),
                          index=s.index,
                          name='xxx')
        tm.assert_series_equal(result, expected)

        tz_result = result.dt.tz
        self.assertEqual(str(tz_result), 'CET')
        freq_result = s.dt.freq
        self.assertEqual(freq_result,
                         DatetimeIndex(s.values, freq='infer').freq)

        # timedeltaindex
        cases = [
            Series(timedelta_range('1 day', periods=5),
                   index=list('abcde'),
                   name='xxx'),
            Series(timedelta_range('1 day 01:23:45', periods=5, freq='s'),
                   name='xxx'),
            Series(timedelta_range('2 days 01:23:45.012345',
                                   periods=5,
                                   freq='ms'),
                   name='xxx')
        ]
        for s in cases:
            for prop in ok_for_td:
                # we test freq below
                if prop != 'freq':
                    compare(s, prop)

            for prop in ok_for_td_methods:
                getattr(s.dt, prop)

            result = s.dt.components
            self.assertIsInstance(result, DataFrame)
            tm.assert_index_equal(result.index, s.index)

            result = s.dt.to_pytimedelta()
            self.assertIsInstance(result, np.ndarray)
            self.assertTrue(result.dtype == object)

            result = s.dt.total_seconds()
            self.assertIsInstance(result, pd.Series)
            self.assertTrue(result.dtype == 'float64')

            freq_result = s.dt.freq
            self.assertEqual(freq_result,
                             TimedeltaIndex(s.values, freq='infer').freq)

        # both
        index = date_range('20130101', periods=3, freq='D')
        s = Series(date_range('20140204', periods=3, freq='s'),
                   index=index,
                   name='xxx')
        exp = Series(np.array([2014, 2014, 2014], dtype='int64'),
                     index=index,
                     name='xxx')
        tm.assert_series_equal(s.dt.year, exp)

        exp = Series(np.array([2, 2, 2], dtype='int64'),
                     index=index,
                     name='xxx')
        tm.assert_series_equal(s.dt.month, exp)

        exp = Series(np.array([0, 1, 2], dtype='int64'),
                     index=index,
                     name='xxx')
        tm.assert_series_equal(s.dt.second, exp)

        exp = pd.Series([s[0]] * 3, index=index, name='xxx')
        tm.assert_series_equal(s.dt.normalize(), exp)

        # periodindex
        cases = [
            Series(period_range('20130101', periods=5, freq='D'), name='xxx')
        ]
        for s in cases:
            for prop in ok_for_period:
                # we test freq below
                if prop != 'freq':
                    compare(s, prop)

            for prop in ok_for_period_methods:
                getattr(s.dt, prop)

            freq_result = s.dt.freq
            self.assertEqual(freq_result, PeriodIndex(s.values).freq)

        # test limited display api
        def get_dir(s):
            results = [r for r in s.dt.__dir__() if not r.startswith('_')]
            return list(sorted(set(results)))

        s = Series(date_range('20130101', periods=5, freq='D'), name='xxx')
        results = get_dir(s)
        tm.assert_almost_equal(
            results, list(sorted(set(ok_for_dt + ok_for_dt_methods))))

        s = Series(
            period_range('20130101', periods=5, freq='D', name='xxx').asobject)
        results = get_dir(s)
        tm.assert_almost_equal(
            results, list(sorted(set(ok_for_period + ok_for_period_methods))))

        # 11295
        # ambiguous time error on the conversions
        s = Series(pd.date_range('2015-01-01', '2016-01-01', freq='T'),
                   name='xxx')
        s = s.dt.tz_localize('UTC').dt.tz_convert('America/Chicago')
        results = get_dir(s)
        tm.assert_almost_equal(
            results, list(sorted(set(ok_for_dt + ok_for_dt_methods))))
        exp_values = pd.date_range('2015-01-01',
                                   '2016-01-01',
                                   freq='T',
                                   tz='UTC').tz_convert('America/Chicago')
        expected = Series(exp_values, name='xxx')
        tm.assert_series_equal(s, expected)

        # no setting allowed
        s = Series(date_range('20130101', periods=5, freq='D'), name='xxx')
        with tm.assertRaisesRegexp(ValueError, "modifications"):
            s.dt.hour = 5

        # trying to set a copy
        with pd.option_context('chained_assignment', 'raise'):

            def f():
                s.dt.hour[0] = 5

            self.assertRaises(com.SettingWithCopyError, f)
示例#57
0
def compute_portvals(start_date, end_date, orders_file, start_val):
    """Compute daily portfolio value given a sequence of orders in a CSV file.

    Parameters
    ----------
        start_date: first date to track
        end_date: last date to track
        orders_file: CSV file to read orders from
        start_val: total starting cash available

    Returns
    -------
        portvals: portfolio value for each trading day from start_date to end_date (inclusive)
    """
    # TODO: Your code here
    symbols = list()
    rowCount = 0
    startDate = 0
    endDate = 0
    lastRow = []

    reader = csv.reader(open(orders_file, 'rU'), delimiter=',')
    for row in reader:
        if rowCount > 0:
            if rowCount == 1:
                startDate = row[0]
            symbols.append(row[1])
        rowCount += 1
        lastRow = row

    orders = pd.read_csv(orders_file,
                         index_col='Date',
                         parse_dates=True,
                         na_values=['nan'])

    print(orders)

    endDate = lastRow[0]
    uniqueList = list(set(symbols))

    dates = pd.date_range(start_date, end_date)
    prices_all = get_data(uniqueList, dates)  # automatically adds SPY
    prices_all['CASH'] = 1.0
    ordersDF = prices_all.copy(deep=True)
    for item in uniqueList:
        ordersDF[item] = 0
    ordersDF['CASH'] = 0.0

    ordersDF = ordersDF.drop('SPY', 1)

    for index, row in orders.iterrows():
        shares = row[2]
        if row[1] == 'SELL':
            row[2] = -1 * row[2]
        if index in ordersDF.index:
            testValue = ordersDF.get_value(index, row[0])
            setValue = row[2]
            if testValue:
                setValue = setValue + testValue

            ordersDF.set_value(index, row[0], setValue)

    #print ordersDF
    for index, row in ordersDF.iterrows():
        rowTotal = 0
        for item in uniqueList:
            price = prices_all.get_value(index, item)
            rowTotal += row[item] * price * -1
        #print rowTotal
        ordersDF.set_value(index, 'CASH', rowTotal)

    #print ordersDF
    holdingsDF = ordersDF.copy(deep=True)

    holdingsDF['CASH'] = 0.0

    holdingsDF.set_value(start_date, 'CASH', start_val)

    loopRow = holdingsDF.iterrows()
    holdingsDF.set_value(start_date, 'CASH', start_val)

    prevValue = start_val
    for index, row in loopRow:
        sharesValue = ordersDF.get_value(index, 'CASH')
        holdingsDF.set_value(index, 'CASH', prevValue + sharesValue)
        prevValue = prevValue + sharesValue

    #print holdingsDF

    loopRow = holdingsDF.iterrows()

    previousValues = {}
    for item in uniqueList:
        previousValues[item] = 0

    for index, row in loopRow:
        for item in uniqueList:

            holdingsDF.set_value(index, item, row[item] + previousValues[item])

            previousValues[item] = row[item] + previousValues[item]

    dfValues = holdingsDF.copy(deep=True)

    dfValues = dfValues.drop('CASH', 1)
    dfValues['VALUES'] = 0.0

    loopRow = holdingsDF.iterrows()
    for index, row in loopRow:
        total = 0
        for item in uniqueList:
            holdingsValue = holdingsDF.get_value(index, item)
            pricesValue = prices_all.get_value(index, item)
            total += holdingsValue * pricesValue
        dfValues.set_value(index, 'VALUES', total)
    #print dfValues
    finalDF = dfValues.copy(deep=True)
    for item in uniqueList:
        finalDF = finalDF.drop(item, 1)

    #print dfValues

    finalDF = finalDF.drop('VALUES', 1)
    finalDF['TOTALS'] = 0
    finalDF['TOTALS'] = dfValues['VALUES'] + holdingsDF['CASH']

    with pd.option_context('display.max_rows', 999, 'display.max_columns', 5):
        print(finalDF['TOTALS'])

    return finalDF['TOTALS']
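A side note on the snippet above: DataFrame.get_value and DataFrame.set_value were removed in pandas 1.0; .at is the scalar accessor that replaces them. A tiny self-contained sketch of the equivalent calls (the frame here is a placeholder):

import pandas as pd

df = pd.DataFrame({'CASH': [0.0, 0.0]}, index=['2011-01-10', '2011-01-11'])
df.at['2011-01-10', 'CASH'] = 1000000.0   # was df.set_value('2011-01-10', 'CASH', 1000000.0)
print(df.at['2011-01-10', 'CASH'])        # was df.get_value('2011-01-10', 'CASH')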
示例#58
0
def fullDisplay(df,max_rows=None,max_col=None,width=None):
    df_cp = df.style.set_properties( **{'width': f'{width}px'}) if width is not None else df.copy()
    with pd.option_context('display.max_rows', max_rows, 'display.max_columns', max_col,):
        display(df_cp)
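A hedged usage sketch for the helper above (it assumes a Jupyter/IPython session, since display comes from IPython; the frame is a placeholder):

import numpy as np
import pandas as pd

big = pd.DataFrame(np.arange(300).reshape(100, 3), columns=list("abc"))
fullDisplay(big)             # show every row and every column
fullDisplay(big, width=120)  # additionally pin each rendered cell to 120px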
示例#59
0
def list_trials(experiment_path,
                sort=None,
                output=None,
                filter_op=None,
                info_keys=None,
                result_keys=None):
    """Lists trials in the directory subtree starting at the given path.

    Args:
        experiment_path (str): Directory where trials are located.
            Corresponds to Experiment.local_dir/Experiment.name.
        sort (str): Key to sort by.
        output (str): Name of file where output is saved.
        filter_op (str): Filter operation in the format
            "<column> <operator> <value>".
        info_keys (list): Keys that are displayed.
        result_keys (list): Keys of last result that are displayed.
    """
    _check_tabulate()
    experiment_state = _get_experiment_state(experiment_path,
                                             exit_on_fail=True)

    checkpoint_dicts = experiment_state["checkpoints"]
    checkpoint_dicts = [flatten_dict(g) for g in checkpoint_dicts]
    checkpoints_df = pd.DataFrame(checkpoint_dicts)

    if not info_keys:
        info_keys = DEFAULT_EXPERIMENT_INFO_KEYS
    if not result_keys:
        result_keys = DEFAULT_RESULT_KEYS
    result_keys = ["last_result:{}".format(k) for k in result_keys]
    col_keys = [
        k for k in list(info_keys) + result_keys if k in checkpoints_df
    ]
    checkpoints_df = checkpoints_df[col_keys]

    if "last_update_time" in checkpoints_df:
        with pd.option_context("mode.use_inf_as_null", True):
            datetime_series = checkpoints_df["last_update_time"].dropna()

        datetime_series = datetime_series.apply(
            lambda t: datetime.fromtimestamp(t).strftime(TIMESTAMP_FORMAT))
        checkpoints_df["last_update_time"] = datetime_series

    if "logdir" in checkpoints_df:
        # logdir often too verbose to view in table, so drop experiment_path
        checkpoints_df["logdir"] = checkpoints_df["logdir"].str.replace(
            experiment_path, "")

    if filter_op:
        col, op, val = filter_op.split(" ")
        col_type = checkpoints_df[col].dtype
        if is_numeric_dtype(col_type):
            val = float(val)
        elif is_string_dtype(col_type):
            val = str(val)
        # TODO(Andrew): add support for datetime and boolean
        else:
            raise ValueError("Unsupported dtype for \"{}\": {}".format(
                val, col_type))
        op = OPERATORS[op]
        filtered_index = op(checkpoints_df[col], val)
        checkpoints_df = checkpoints_df[filtered_index]

    if sort:
        if sort not in checkpoints_df:
            raise KeyError("Sort Index \"{}\" not in: {}".format(
                sort, list(checkpoints_df)))
        checkpoints_df = checkpoints_df.sort_values(by=sort)

    print_format_output(checkpoints_df)

    if output:
        file_extension = os.path.splitext(output)[1].lower()
        if file_extension in (".p", ".pkl", ".pickle"):
            checkpoints_df.to_pickle(output)
        elif file_extension == ".csv":
            checkpoints_df.to_csv(output, index=False)
        else:
            raise ValueError("Unsupported filetype: {}".format(output))
        print("Output saved at:", output)
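A hedged invocation sketch for the function above (the experiment path and result key are placeholders); filter_op follows the "<column> <operator> <value>" format described in the docstring:

# hypothetical call: keep trials whose last mean reward exceeds 100,
# sort them by update time, and also save the table as CSV
list_trials(
    "~/ray_results/my_experiment",
    sort="last_update_time",
    output="trials.csv",
    filter_op="last_result:episode_reward_mean > 100",
)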
示例#60
0
def main():

    HistoPlot(year20['W_L_percent'], 10, 'Winning Percentage by Team',
              'Distribution of Team Winning Percentage', False, 0, 7)

    HistoPlot(year20['ERA'], 10, 'Earned Run Average',
              'Distribution of 2020 Team Earned Run Average', False, 0, 7)

    HistoPlot(year20['RBI'], 10, 'Runs Batted In',
              'Distribution of 2020 Team RBIs', False, 0, 6)

    HistoPlot(year20['TB'], 10, 'Total Bases Achieved',
              'Distribution of 2020 Total Bases Achieved by Team', False, 0, 7)

    HistoPlot(year20['TotalRuns'], 10, 'Total Runs Scored by a Team',
              'Total Runs Scored per Team 2020', False, 0, 6)

    HistoPlot(year20['BA'], 10, 'Team Batting Average',
              'Distribution of Team Batting Average for 2020', False, 0, 8)

    HistoPlot(year20['SLG'], 10, 'Total Slugging Percentage',
              'Distribution of Team Slugging Percentage for 2020', False, 0, 6)

    HistoPlot(year20['WHIP'], 10, 'Walks & Hits Per Inning',
              'Walks and Hits per Inning Pitched 2020 Distribution', False, 0,
              7)

    HistoPlot(year20['OBP'], 10, 'On-Base Percentage',
              '2020 On-Base Percentage of Teams Distribution', False, 0, 8)

    # Representation of some Outliers in the Data
    print('2020 Team(s) with an ERA greater than 5.2:')
    print(year20[year20.ERA > 5.2][['Rk', 'Tm']])
    print('\n')

    print('2020 Team(s) with on On-Base Percentage less than .300:')
    print(year20[year20.OBP < .3][['Rk', 'Tm']])
    print('\n')

    print('2020 Team(s) with less than 225 RBIs:')
    print(year20[year20.RBI < 225][['Rk', 'Tm']])
    print('\n')

    print('2020 Team(s) with more than 325 RBIs:')
    print(year20[year20.RBI > 325][['Rk', 'Tm']])
    print('\n')

    print('2020 Team(s) with more than 330 Total Runs Scored:')
    print(year20[year20.TotalRuns > 330][['Rk', 'Tm']])
    print('\n')

    print(
        '2020 Team(s) with less than 1.19 Walks and Hits per Inning Pitched:')
    print(year20[year20.WHIP < 1.19][['Rk', 'Tm']])
    print('\n')

    print('2020 Team(s) with more than 1.5 Walks and Hits per Inning Pitched:')
    print(year20[year20.WHIP > 1.5][['Rk', 'Tm']])
    print('\n')

    print('2020 Team(s) with more than 950 Total Bases Achieved:')
    print(year20[year20.TB > 950][['Rk', 'Tm']])

    print('\n')

    print('Season - 2020:' + '\n')
    print('League rank for the outliers on the "Home Runs" histogram plot,' +
          '\n' + 'Home Runs by a Team greater than 100: ' +
          str(year20[year20.HR_x > 100]['Rk'].values) + '\n')

    print(
        'League rank for the outliers on the "Runs Batted In" histogram plot' +
        '\n' + 'Total Team RBIs less than 225: ' +
        str(year20[year20.RBI < 225]['Rk'].values) + '\n')

    print('League rank for the outliers on the "Total Bases" histogram plot' +
          '\n' + 'Total Bases Completed by a Team greater than 950: ' +
          str(year20[year20.TB > 950]['Rk'].values) + '\n')

    print(
        'League rank for the outliers on the "Earned Run Average" histogram plot'
        + '\n' + 'Team ERA Average greater than 5.2: ' +
        str(year20[year20.ERA > 5.2]['Rk'].values))

    print('\n')
    print('Season - 2019:' + '\n')
    print('League rank for the outliers on the "Home Runs" histogram plot,' +
          '\n' + 'Home Runs by a Team less than 100: ' +
          str(year19[year19.HR_x < 199]['Rk'].values) + '\n')

    print('League rank for the outliers on the "Runs Scored" histogram plot,' +
          '\n' + 'Runs Scored by a Team less than 650: ' +
          str(year19[year19.RunsPerGame < 650]['Rk'].values) + '\n')

    year20['ERA2'] = year20.ERA**2
    year20['ERA3'] = year20.ERA**3

    df_z = year20.select_dtypes(include=[np.number]).dropna().apply(st.zscore)

    formula = 'W_L_percent ~ ERA + RBI + TB'

    year20_model = smf.ols(formula, data=year20)

    result = smf.ols(formula, data=df_z).fit()

    year20_results = year20_model.fit()

    print(year20_results.summary())

    print(result.summary())
    print('\n')

    with pd.option_context('display.max_rows', None, 'display.max_columns',
                           None):
        print('Wonders of Rank 14?:' + '\n' + str(year19[
            year19.Rk == 14][['Tm', 'ERA', 'TB', 'WHIP', 'RBI', 'OBP']]))

    print('\n')

    with pd.option_context('display.max_rows', None, 'display.max_columns',
                           None):
        print(year19[year19.Tm == 'LgAvg'][[
            'Rk', 'Tm', 'ERA', 'TB', 'WHIP', 'RBI', 'OBP'
        ]])

    print('\n')

    print('2020 Variables Averages between the American and National Leagues:')
    print(year20[['Lg', 'TotalRuns', 'BA', 'ERA', 'OBP']].groupby('Lg').mean())

    iloc_numbers1 = [6, 8, 9, 20, 21, 24, 26]

    iloc_numbers2 = [6, 27, 28, 29, 30, 33, 44]

    iloc_numbers3 = [6, 52, 67, 71, 72, 76, 84, 86]

    ComparingPlot(
        'W_L_percent', 10,
        'Distribution of Winning Percentage from the 2017-2020 Seasons',
        'Winning %', 'Winning Percentage (%)', 0.05, 0.2)

    ComparingPlot('ERA', 10,
                  'Distribution of Earned Run Average, 2017-2020 Seasons',
                  'ERA', 'Earned Run Average', 0.1, 0.1)

    ComparingPlot(
        'BA', 10, 'Comparing Team Batting Averages from the 2017-2020 Seasons',
        'Batting Average', 'Team Batting Average', 0.1, 0.05)

    CorrelationMatrix(year20, iloc_numbers1)
    print('\n')

    CorrelationMatrix(year20, iloc_numbers2)
    print('\n')

    CorrelationMatrix(year20, iloc_numbers3)
    print('\n')

    ALvsNLcomp20()
    print('\n')

    print('\n')
    print('2020 AL Slugging Percentage Average: ' +
          str(round(np.mean(AL20['SLG']), 3)))
    print('2020 NL Slugging Percentage Average: ' +
          str(round(np.mean(NL20['SLG']), 3)))
    print('\n')
    print('2020 AL Earned Run Average: ' + str(round(np.mean(AL20['ERA']), 3)))
    print('2020 NL Earned Run Average: ' + str(round(np.mean(NL20['ERA']), 3)))
    print('\n')

    ALvsNLcomp19()
    print('\n')

    print('2019 AL Slugging Percentage Average: ' +
          str(round(np.mean(AL19['SLG']), 3)))
    print('2019 NL Slugging Percentage Average: ' +
          str(round(np.mean(NL19['SLG']), 3)))
    print('\n')
    print('2019 AL Earned Run Average: ' + str(round(np.mean(AL19['ERA']), 3)))
    print('2019 NL Earned Run Average: ' + str(round(np.mean(NL19['ERA']), 3)))
    print('\n')

    TestCDF('WHIP', 'Walks & Hits (per Inning Pitched)',
            'Cumulative Distribution Function of WHIP by Team')

    TestCDF('BA', 'Team Batting Average',
            'Cumulative Distribution Function of Team Batting Average')

    TestCDF('SLG', 'Slugging Percentage (% per Team)',
            'Cumulative Distribution Function of Team Slugging Percentage')

    TestCDF('ERA', 'Opponent Earned Runs Average (per 9 Innings)',
            'Cumulative Distribution Function of Earned Run Average')

    TestCDF('OBP', 'On-Base Percentage (%)',
            'Cumulative Distribution Function of On-Base Percentage by Team')

    TestCDF('RunsPGame', 'Runs Per Game',
            'Cumulative Distribution Function of Runs Per Game')

    TestCDF('SO_x', 'Strikeouts',
            'Cumulative Distribution Function of Strikeouts')

    OPS_cdf(log=False)
    print('\n')
    OPS_cdf(log=True)
    print('\n')

    ALvsNL('ERA')
    print('\n')

    ALvsNL('SLG')
    print('\n')

    ALvsNL('OBP')
    print('\n')

    ALvsNL('BA')
    print('\n')

    ALvsNL('WHIP')
    print('\n')

    ALvsNL('SO9')
    print('\n')

    ALvsNL('TB')
    print('\n')

    ALvsNL('SO_x')
    print('\n')

    era17_19 = year19['ERA'].append(year18['ERA']).append(year17['ERA'])

    year_era_test = st.ttest_ind(year20['ERA'], era17_19, equal_var=False)

    ba17_19 = year19['BA'].append(year18['BA']).append(year17['BA'])

    year_ba_test = st.ttest_ind(year20['BA'], ba17_19, equal_var=False)

    slg17_19 = year19['SLG'].append(year18['SLG']).append(year17['SLG'])

    year_slg_test = st.ttest_ind(year20['SLG'], slg17_19, equal_var=False)

    print('BA')
    print(year_ba_test)
    print('\n')
    print('SLG')
    print(year_slg_test)
    print('\n')
    print('ERA')
    print(year_era_test)
    print('\n')

    # Comparing

    Comparing2020('BA')
    print('\n')

    Comparing2020('SLG')
    print('\n')

    Comparing2020('ERA')
    print('\n')

    Comparing2020('WHIP')
    print('\n')

    # Correlation Plot and Calculations between variables
    ScatterPlot('SO9', 'W_L_percent',
                'Importance of Strikeouts and Winning Percentage',
                'Strikeouts (per 9 Innings)', 'Winning Percentage (%)')
    print('\n')

    ScatterPlot('RunsPGame', 'W_L_percent', 'Score Runs and Winning Games',
                'Runs Scored (per Game)', 'Winning Percentage (%)')
    print('\n')

    ScatterPlot('OBP', 'W_L_percent',
                'Comparison between On-Base Percentage and Winning',
                'On-Base Percentage (%)', 'Winning Percentage (%)')
    print('\n')

    ScatterPlot('ERA', 'W_L_percent',
                'Pitchers Abilities to Win Games (ERA vs Winning)',
                'Earned Run Average', 'Winning Percentage (%)')
    print('\n')

    ScatterPlot('ERA', 'SO9',
                'Earned Run Average vs Strikeouts per 9 Innings',
                'Earned Run Average', 'Strikeouts (per 9 Innings)')
    print('\n')

    ScatterPlot('ERA', 'WHIP',
                'Earned Run Average vs Walks & Hits per Inning Pitched',
                'Earned Run Average', 'Walks & Hits per Inning Pitched')
    print('\n')

    ScatterPlot('WHIP', 'SO9',
                'Walks & Hits per Inning Pitched vs Strikeouts per 9 Innings',
                'Walks & Hits per Inning Pitched', 'Strikeouts (per 9 Innings)')
    print('\n')

    ScatterPlot('TB', 'W_L_percent',
                'Total Bases Achieved vs Winning Percentage',
                'Total Bases Achieved', 'Winning Percentage (%)')
    print('\n')

    ScatterPlot('BA', 'W_L_percent', 'Comparing Batting Average and Winning %',
                'Batting Average', 'Winning Percentage (%)')
    print('\n')

    ScatterPlot('WHIP', 'W_L_percent', 'WHIP vs Winning Percentage',
                'Walks & Hits per Inning Pitched', 'Winning Percentage (%)')
    print('\n')

    ScatterPlot('RunsPGame', 'OBP', 'On-Base Percentage\'s Influence on Runs',
                'Runs Scored (per Game)', 'On-Base Percentage (%)')