Example #1
    def test_join_multiindex(self):
        index1 = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'],
                                         [1, 2, 3, 1, 2, 3]],
                                        names=['first', 'second'])

        index2 = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'],
                                         [1, 2, 3, 1, 2, 3]],
                                        names=['first', 'second'])

        df1 = DataFrame(data=np.random.randn(6), index=index1,
                        columns=['var X'])
        df2 = DataFrame(data=np.random.randn(6), index=index2,
                        columns=['var Y'])

        df1 = df1.sort_index(level=0)
        df2 = df2.sort_index(level=0)

        joined = df1.join(df2, how='outer')
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names
        assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

        df1 = df1.sort_index(level=1)
        df2 = df2.sort_index(level=1)

        joined = df1.join(df2, how='outer').sort_index(level=0)
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names

        assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names
Example #2
    def test_join_multi_to_multi(self, join_type):
        # GH 20475
        leftindex = MultiIndex.from_product([list('abc'), list('xy'), [1, 2]],
                                            names=['abc', 'xy', 'num'])
        left = DataFrame({'v1': range(12)}, index=leftindex)

        rightindex = MultiIndex.from_product([list('abc'), list('xy')],
                                             names=['abc', 'xy'])
        right = DataFrame({'v2': [100 * i for i in range(1, 7)]},
                          index=rightindex)

        result = left.join(right, on=['abc', 'xy'], how=join_type)
        expected = (left.reset_index()
                        .merge(right.reset_index(),
                               on=['abc', 'xy'], how=join_type)
                        .set_index(['abc', 'xy', 'num'])
                    )
        assert_frame_equal(expected, result)

        msg = (r'len\(left_on\) must equal the number of levels in the index'
               ' of "right"')
        with pytest.raises(ValueError, match=msg):
            left.join(right, on='xy', how=join_type)

        with pytest.raises(ValueError, match=msg):
            right.join(left, on=['abc', 'xy'], how=join_type)
Example #3
def plots_workingTrends():

	# holiday = 0 and workday = 0 => weekend
	# let's see if holidays and weekends give the same trends

	# Day trends -- working vs. non-working day
	hours = np.linspace(0,23,24)

	days_average = DataFrame({'Hour': hours})

	# workdays
	mean_vec = []
	for hour in hours:
		mean_vec.append(bike_data[ (bike_data["workingday"] == 1) & (bike_data["time"] == hour) ].mean()['count'])
	days_average = days_average.join(DataFrame({'Working day': mean_vec}))

	# holidays or weekends
	mean_vec = []
	for hour in hours:
		mean_vec.append(bike_data[ (bike_data["workingday"] == 0) & (bike_data["time"] == hour) ].mean()['count'])
	days_average = days_average.join(DataFrame({'Non-working day': mean_vec}))

	days_average.drop('Hour',axis=1).plot(figsize=(12, 6), linewidth=3, fontsize=16)
	plt.xlabel('Hour', fontsize=16)
	plt.ylabel('Average counts', fontsize=16)
	plt.legend(loc='best', fontsize=16)
	plt.show()
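The per-hour loops above can be expressed as one pivot table; a minimal sketch, assuming the same bike_data frame with 'time', 'workingday' and 'count' columns:

import pandas as pd

def working_trends(bike_data: pd.DataFrame) -> pd.DataFrame:
    # mean count per hour, split by the workingday flag (1 = workday)
    pivot = bike_data.pivot_table(index='time', columns='workingday',
                                  values='count', aggfunc='mean')
    return pivot.rename(columns={1: 'Working day', 0: 'Non-working day'})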
Example #4
    def test_join_on_singlekey_list(self):
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])

        # corner cases
        joined = df.join(df2, on=['key'])
        expected = df.join(df2, on='key')

        assert_frame_equal(joined, expected)
Example #5
 def test_join_segfault(self):
     # 1532
     df1 = DataFrame({'a': [1, 1], 'b': [1, 2], 'x': [1, 2]})
     df2 = DataFrame({'a': [2, 2], 'b': [1, 2], 'y': [1, 2]})
     df1 = df1.set_index(['a', 'b'])
     df2 = df2.set_index(['a', 'b'])
     # it works!
     for how in ['left', 'right', 'outer']:
         df1.join(df2, how=how)
Example #6
    def test_join_unconsolidated(self):
        # GH #331
        a = DataFrame(randn(30, 2), columns=['a', 'b'])
        c = Series(randn(30))
        a['c'] = c
        d = DataFrame(randn(30, 1), columns=['q'])

        # it works!
        a.join(d)
        d.join(a)
Example #7
class JoinIndex(object):

    def setup(self):
        N = 50000
        self.left = DataFrame(np.random.randint(1, N / 500, (N, 2)),
                              columns=['jim', 'joe'])
        self.right = DataFrame(np.random.randint(1, N / 500, (N, 2)),
                               columns=['jolie', 'jolia']).set_index('jolie')

    def time_left_outer_join_index(self):
        self.left.join(self.right, on='jim')
Example #8
    def test_join_on_inner(self):
        df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1]}, index=['a', 'b'])

        joined = df.join(df2, on='key', how='inner')

        expected = df.join(df2, on='key')
        expected = expected[expected['value'].notna()]
        tm.assert_series_equal(joined['key'], expected['key'],
                               check_dtype=False)
        tm.assert_series_equal(joined['value'], expected['value'],
                               check_dtype=False)
        tm.assert_index_equal(joined.index, expected.index)
Example #9
def merge_with_technicals(currency_list, returns_table, fundamentals_table, RSI, MACD, Stochastics, beg_date, stoch_date):
	# Create empty list, will hold dataframes for all currencies
	dataframe_list = []
	for currency in currency_list:
		buildup_dataframe = DataFrame(returns_table[currency])
		buildup_dataframe = buildup_dataframe.join(fundamentals_table, how='left', rsuffix='')
		buildup_dataframe = buildup_dataframe.join(RSI[currency], how='left', rsuffix='_RSI')
		buildup_dataframe = buildup_dataframe.join(MACD[currency], how='left', rsuffix='_MACD')
		if beg_date > stoch_date:
			buildup_dataframe = buildup_dataframe.join(Stochastics[currency], how='left', rsuffix='_Stoch')
		dataframe_list.append(buildup_dataframe)

	return dataframe_list
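The rsuffix arguments above are what keep the overlapping per-currency columns apart; a minimal sketch with hypothetical frames:

import pandas as pd

px = pd.DataFrame({'EURUSD': [1.10, 1.11]})   # returns (hypothetical values)
rsi = pd.DataFrame({'EURUSD': [55.0, 60.0]})  # RSI (hypothetical values)
# join aligns on the index; overlapping column names require lsuffix/rsuffix
print(px.join(rsi, rsuffix='_RSI'))           # columns: EURUSD, EURUSD_RSI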
Example #10
def read_data(test=False):

    if test:
        filename = 'test.csv'
    else:
        filename = 'train.csv'
    
    # read data; output: dataframe
    data = pd.read_csv(filename)

    # split datetime into date and time
    date = []
    time = []
    for row in data['datetime']:
        row = row.split()
        date.append(row[0])
        time.append(int(row[1].split(':')[0]))

    date_and_time = DataFrame({'date': date,
                               'time': time})

    del data['datetime']
    data = date_and_time.join(data)

    # add day of the week
    day = []
    # https://docs.python.org/2/library/datetime.html
    # .strftime('%A') -- sets proper format
    for row in data['date']:
        day.append(datetime.datetime.strptime(row, '%Y-%m-%d').strftime('%A'))

    data = DataFrame({'day': day}).join(data)
    
    # split date into year | month | dayMonth
    year = []
    month = []
    dayMonth = []
    for row in data['date']:
        row = row.split('-')
        year.append(int(row[0]))
        month.append(int(row[1]))
        dayMonth.append(int(row[2]))

    year_month_day = DataFrame({'year' : year,
                                'month': month,
                                'dayMonth' : dayMonth})

    del data['date']
    data = year_month_day.join(data)
    
    return data
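The string-splitting loops above can also be written with pandas datetime accessors; a sketch, assuming the 'datetime' column is formatted 'YYYY-MM-DD HH:MM:SS' as in the Kaggle bike-sharing data:

import pandas as pd

def split_datetime(data: pd.DataFrame) -> pd.DataFrame:
    dt = pd.to_datetime(data['datetime'])
    parts = pd.DataFrame({'day': dt.dt.day_name(),  # e.g. 'Monday'
                          'time': dt.dt.hour,
                          'year': dt.dt.year,
                          'month': dt.dt.month,
                          'dayMonth': dt.dt.day})
    return parts.join(data.drop(columns='datetime'))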
Example #11
    def test_join_many_non_unique_index(self):
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])

        result = idf1.join([idf2, idf3], how='outer')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer')

        result = result.reset_index()
        expected = expected[result.columns]
        expected['a'] = expected.a.astype('int64')
        expected['b'] = expected.b.astype('int64')
        assert_frame_equal(result, expected)

        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
        df3 = DataFrame(
            {"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])
        result = idf1.join([idf2, idf3], how='inner')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner')

        result = result.reset_index()

        assert_frame_equal(result, expected.loc[:, result.columns])

        # GH 11519
        df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                              'foo', 'bar', 'foo', 'foo'],
                        'B': ['one', 'one', 'two', 'three',
                              'two', 'two', 'one', 'three'],
                        'C': np.random.randn(8),
                        'D': np.random.randn(8)})
        s = Series(np.repeat(np.arange(8), 2),
                   index=np.repeat(np.arange(8), 2), name='TEST')
        inner = df.join(s, how='inner')
        outer = df.join(s, how='outer')
        left = df.join(s, how='left')
        right = df.join(s, how='right')
        assert_frame_equal(inner, outer)
        assert_frame_equal(inner, left)
        assert_frame_equal(inner, right)
Example #12
def parse_GDS_columns(lines, subsets):
    """Parse list of line with columns description from SOFT file
    of GDS (GEO Dataset)

    :param lines: iterable -- iterator over lines
    :returns: pandas.DataFrame -- columns description

    """
    data = []
    index = []
    for line in lines:
        line = line.rstrip()
        if line.startswith("#"):
            tmp = __parse_entry(line)
            data.append(tmp[1])
            index.append(tmp[0])

    df = DataFrame(data, index=index, columns=['description'])
    subset_ids = {"disease_state": {}, "individual": {}}
    for subsetname, subset in subsets.items():  # .iteritems() in the Python 2 original
        for expid in subset.metadata["sample_id"][0].split(","):
            if subset.get_type() == "disease state":
                subset_ids["disease_state"][expid] = subset.metadata["description"][0]
            elif subset.get_type() == "individual":
                subset_ids["individual"][expid] = subset.metadata["description"][0]
            else:
                stderr("Unknown subset type: %s for subset %s\n" % (subset.get_type(), subsetname))

    return df.join(DataFrame(subset_ids))
Example #13
    def test_join_sort(self):
        left = DataFrame({'key': ['foo', 'bar', 'baz', 'foo'],
                          'value': [1, 2, 3, 4]})
        right = DataFrame({'value2': ['a', 'b', 'c']},
                          index=['bar', 'baz', 'foo'])

        joined = left.join(right, on='key', sort=True)
        expected = DataFrame({'key': ['bar', 'baz', 'foo', 'foo'],
                              'value': [2, 3, 1, 4],
                              'value2': ['a', 'b', 'c', 'c']},
                             index=[1, 2, 0, 3])
        assert_frame_equal(joined, expected)

        # smoke test
        joined = left.join(right, on='key', sort=False)
        tm.assert_index_equal(joined.index, pd.Index(lrange(4)))
Example #14
    def test_join_mixed_non_unique_index(self):
        # GH 12814, unorderable types in py3 with a non-unique index
        df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a'])
        df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4])
        result = df1.join(df2)
        expected = DataFrame({'a': [1, 2, 3, 3, 4],
                              'b': [5, np.nan, 6, 7, np.nan]},
                             index=[1, 2, 3, 3, 'a'])
        tm.assert_frame_equal(result, expected)

        df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a'])
        df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4])
        result = df3.join(df4)
        expected = DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 6, np.nan]},
                             index=[1, 2, 2, 'a'])
        tm.assert_frame_equal(result, expected)
Example #15
    def to_dataframe(self, selected_fields=None, excluded_fields=None):
        from ..services import locations

        if excluded_fields:
            qs = self.exclude(*excluded_fields)
        else:
            qs = self.exclude(*self.DEFAULT_EXCLUDED_FIELDS)
        if selected_fields:
            qs = self.only(*selected_fields)

        df = DataFrame(list(qs.as_pymongo())).convert_objects(convert_numeric=True)
        if df.empty:
            return df

        # add fields with no values
        fields = filter(
            lambda f: f not in df.columns,
            map(lambda field: field.name, [field for group in self.first().form.groups for field in group.fields]),
        )

        for field in fields:
            df[field] = Series(np.nan, index=df.index)

        # do cleanup of subdocument fields
        for field in self.SUBDOCUMENT_FIELDS:
            temp = df.pop(field).tolist()
            temp2 = [i if not isnull(i) else {} for i in temp]
            df = df.join(DataFrame(temp2))

        rv_map = locations.registered_voters_map()

        df["registered_voters"] = df.location.apply(lambda i: rv_map.get(i, 0))

        return df
Example #16
def foreach_dataframe(self, func, force_dict=False, *args, **kwargs):
    """
        Really just does a foreach with each being dfs in a panel. 
    """
    d = {}
    for key, df in self.items():
        d[key] = func(df, *args, **kwargs)
    container = PanelDict
    for key, result in list(d.items()):
        if isinstance(result, Series):
            container = DataFrame
            break
        if isinstance(result, DataFrame):
            container = Panel
            break

    index = []
    for key, result in list(d.items()):
        if not isinstance(result, (DataFrame, Series)):
            continue
        result.name = key
        ind = result.index
        index = set(index).union(ind) 

    if force_dict:
        return PanelDict(d)

    res = DataFrame(None, index=index)
    for key, result in list(d.items()):
        res = res.join(result)

    res = res.sort_index()  # DataFrame.sort() was removed in later pandas; sort_index() is the equivalent
    return res
Example #17
    def test_join_aware(self):
        rng = date_range('1/1/2011', periods=10, freq='H')
        ts = Series(np.random.randn(len(rng)), index=rng)

        ts_utc = ts.tz_localize('utc')

        self.assertRaises(Exception, ts.__add__, ts_utc)
        self.assertRaises(Exception, ts_utc.__add__, ts)

        test1 = DataFrame(np.zeros((6,3)),
                          index=date_range("2012-11-15 00:00:00", periods=6,
                                           freq="100L", tz="US/Central"))
        test2 = DataFrame(np.zeros((3,3)),
                          index=date_range("2012-11-15 00:00:00", periods=3,
                                           freq="250L", tz="US/Central"),
                          columns=range(3,6))

        result = test1.join(test2, how='outer')
        ex_index = test1.index.union(test2.index)

        self.assertTrue(result.index.equals(ex_index))
        self.assertTrue(result.index.tz.zone == 'US/Central')

        # non-overlapping
        rng = date_range("2012-11-15 00:00:00", periods=6,
                         freq="H", tz="US/Central")

        rng2 = date_range("2012-11-15 12:00:00", periods=6,
                         freq="H", tz="US/Eastern")

        result = rng.union(rng2)
        self.assertTrue(result.tz.zone == 'UTC')
Example #18
 def saveGrid(self,output):
     arq = open(output+'.txt', "w")
     arq.write(self.output)
     arq.close()
     dfCoulomb = DataFrame(self.coulombMatrix, columns = self.cCoulomb, index = self.molecules)
     dfLj = DataFrame(self.ljMatrix, columns = self.cLJ, index = self.molecules)
     df = dfCoulomb.join(dfLj)
     df.to_csv(output+'.csv', sep =';')
Example #19
    def test_join_index_mixed(self, join_type):
        # no overlapping blocks
        df1 = DataFrame(index=np.arange(10))
        df1['bool'] = True
        df1['string'] = 'foo'

        df2 = DataFrame(index=np.arange(5, 15))
        df2['int'] = 1
        df2['float'] = 1.

        joined = df1.join(df2, how=join_type)
        expected = _join_by_hand(df1, df2, how=join_type)
        assert_frame_equal(joined, expected)

        joined = df2.join(df1, how=join_type)
        expected = _join_by_hand(df2, df1, how=join_type)
        assert_frame_equal(joined, expected)
Example #20
    def test_left_join_index_preserve_order(self):

        on_cols = ['k1', 'k2']
        left = DataFrame({'k1': [0, 1, 2] * 8,
                          'k2': ['foo', 'bar'] * 12,
                          'v': np.array(np.arange(24), dtype=np.int64)})

        index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
        right = DataFrame({'v2': [5, 7]}, index=index)

        result = left.join(right, on=on_cols)

        expected = left.copy()
        expected['v2'] = np.nan
        expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5
        expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7

        tm.assert_frame_equal(result, expected)

        result.sort_values(on_cols, kind='mergesort', inplace=True)
        expected = left.join(right, on=on_cols, sort=True)

        tm.assert_frame_equal(result, expected)

        # test join with multi dtypes blocks
        left = DataFrame({'k1': [0, 1, 2] * 8,
                          'k2': ['foo', 'bar'] * 12,
                          'k3': np.array([0, 1, 2] * 8, dtype=np.float32),
                          'v': np.array(np.arange(24), dtype=np.int32)})

        index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
        right = DataFrame({'v2': [5, 7]}, index=index)

        result = left.join(right, on=on_cols)

        expected = left.copy()
        expected['v2'] = np.nan
        expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5
        expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7

        tm.assert_frame_equal(result, expected)

        result = result.sort_values(on_cols, kind='mergesort')
        expected = left.join(right, on=on_cols, sort=True)

        tm.assert_frame_equal(result, expected)
Example #21
    def dataframe(self):
        tss = self.eval()
        df = DataFrame()
        # FIXME: should do something about potential for dupe names
        for ts, h in zip(tss, self.hidden):
            if not h and not isinstance(ts, str):
                df = df.join(ts, how='outer')
        return df
Example #22
    def test_left_join_index_multi_match(self):
        left = DataFrame([
            ['c', 0],
            ['b', 1],
            ['a', 2],
            ['b', 3]],
            columns=['tag', 'val'],
            index=[2, 0, 1, 3])

        right = (DataFrame([
            ['a', 'v'],
            ['c', 'w'],
            ['c', 'x'],
            ['d', 'y'],
            ['a', 'z'],
            ['c', 'r'],
            ['e', 'q'],
            ['c', 's']],
            columns=['tag', 'char'])
            .set_index('tag'))

        result = left.join(right, on='tag', how='left')

        expected = DataFrame([
            ['c', 0, 'w'],
            ['c', 0, 'x'],
            ['c', 0, 'r'],
            ['c', 0, 's'],
            ['b', 1, nan],
            ['a', 2, 'v'],
            ['a', 2, 'z'],
            ['b', 3, nan]],
            columns=['tag', 'val', 'char'],
            index=[2, 2, 2, 2, 0, 1, 1, 3])

        tm.assert_frame_equal(result, expected)

        result = left.join(right, on='tag', how='left', sort=True)
        expected2 = expected.sort_values('tag', kind='mergesort')

        tm.assert_frame_equal(result, expected2)

        # GH7331 - maintain left frame order in left merge
        result = merge(left, right.reset_index(), how='left', on='tag')
        expected.index = np.arange(len(expected))
        tm.assert_frame_equal(result, expected)
Example #23
 def test_join_on_series_buglet(self):
     # GH #638
     df = DataFrame({'a': [1, 1]})
     ds = Series([2], index=[1], name='b')
     result = df.join(ds, on='a')
     expected = DataFrame({'a': [1, 1],
                           'b': [2, 2]}, index=df.index)
     tm.assert_frame_equal(result, expected)
Example #24
def encode_onehot(df: pd.DataFrame, cols):
    vec = DictVectorizer()
    vec_data = pd.DataFrame(vec.fit_transform(df[cols].to_dict(orient='records')).toarray())
    vec_data.columns = vec.get_feature_names()
    vec_data.index = df.index

    df = df.drop(cols, axis=1)
    df = df.join(vec_data)
    return df
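Hypothetical usage of encode_onehot with toy data, reusing the snippet's imports (DictVectorizer names its features 'column=value'):

df = pd.DataFrame({'city': ['NY', 'SF', 'NY'], 'v': [1, 2, 3]})
df = encode_onehot(df, ['city'])  # adds 'city=NY' and 'city=SF' indicator columns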
Example #25
    def test_join_str_datetime(self):
        str_dates = ['20120209', '20120222']
        dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)]

        A = DataFrame(str_dates, index=lrange(2), columns=['aa'])
        C = DataFrame([[1, 2], [3, 4]], index=str_dates, columns=dt_dates)

        tst = A.join(C, on='aa')

        assert len(tst.columns) == 3
Example #26
def get_results_df(db, rev):
    """Takes a git commit hash and returns a Dataframe of benchmark results
    """
    bench = DataFrame(db.get_benchmarks())
    results = DataFrame(map(list,db.get_rev_results(rev).values()))

    # Since vbench.db._reg_rev_results returns an unlabeled dict,
    # we have to break encapsulation a bit.
    results.columns = db._results.c.keys()
    results = results.join(bench['name'], on='checksum').set_index("checksum")
    return results
Example #27
def runnig_check():
	result = DataFrame()
	tmp = DataFrame()
	for i in range(0,3):
		if i == 0:
			result = make_keti_data_to_df(i)
		else:
			tmp = result
			result = tmp.join(make_keti_data_to_df(i))
		time.sleep(2)
	return result
Example #28
File: tank.py Project: cpcloud/span
    def _read_tsq(self, event_name):
        """Read the metadata (TSQ) file of a TDT Tank.

        Returns
        -------
        b : pandas.DataFrame
            Recording metadata
        """
        # create the path name
        tsq_name = self.path + os.extsep + self.header_ext

        # read in the raw data as a numpy rec array and convert to DataFrame
        b = DataFrame(np.fromfile(tsq_name, dtype=self.tsq_dtype))

        # zero based indexing
        b.channel -= 1
        b.channel = b.channel.astype(f8)

        # -1s are invalid
        b.channel[b.channel == -1] = np.nan

        b.type = EventTypes[b.type].reset_index(drop=True)
        b.format = DataTypes[b.format].reset_index(drop=True)

        b.timestamp[np.logical_not(b.timestamp)] = np.nan
        b.fs[np.logical_not(b.fs)] = np.nan

        # fragile subtraction (i.e., what if TDT changes this value?)
        b.size -= 10

        # create some new indices based on the electrode array
        srt = Indexer.sort('channel').reset_index(drop=True)
        shank = srt.shank[b.channel].reset_index(drop=True)

        tsq = b.join(shank)

        # convert the event_name to a number
        name = name2num(event_name)

        # get the row of the metadata where its value equals the name-number
        row = tsq.name == name

        # make sure there's at least one event
        assert row.any(), 'no event named %s in tank: %s' % (event_name,
                                                             self.path)

        # get all the metadata for those events
        tsq = tsq[row]

        # convert to integer where possible
        tsq.channel = tsq.channel.astype(int)
        tsq.shank = tsq.shank.astype(int)

        return tsq, row
Example #29
def plots_casRegTrends():

	hours = np.linspace(0,23,24)
	days_average = DataFrame({'Hour': hours})

	mean_vec = []
	for hour in hours:
		mean_vec.append(bike_data[ (bike_data["time"] == hour) ].mean()['casual'])
	days_average = days_average.join(DataFrame({'Casual': mean_vec}))

	mean_vec = []
	for hour in hours:
		mean_vec.append(bike_data[ (bike_data["time"] == hour) ].mean()['registered'])
	days_average = days_average.join(DataFrame({'Registered': mean_vec}))

	days_average.drop('Hour',axis=1).plot(figsize=(12, 6), linewidth=3, fontsize=16)
	plt.xlabel('Hour', fontsize=16)
	plt.ylabel('Average counts', fontsize=16)
	plt.legend(loc='best', fontsize=16)
	plt.show()
Example #30
 def test_join_non_unique_period_index(self):
     # GH #16871
     index = pd.period_range('2016-01-01', periods=16, freq='M')
     df = DataFrame([i for i in range(len(index))],
                    index=index, columns=['pnum'])
     df2 = concat([df, df])
     result = df.join(df2, how='inner', rsuffix='_df2')
     expected = DataFrame(
         np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
         columns=['pnum', 'pnum_df2'], index=df2.sort_index().index)
     tm.assert_frame_equal(result, expected)
Example #31
scaler = MinMaxScaler(feature_range=(0, 1))
scaler2 = MinMaxScaler(feature_range=(0, 1))
scale_X = df.loc[:, ["Daily_data", "Hourly_data", "Monthly_data", "Pre_year_data"]]
scale_Y = df.loc[:, ["Label_year_data"]]
scalerX = scaler.fit(scale_X)
# use a second scaler for Y: refitting the same object would overwrite the X fit
scalery = scaler2.fit(scale_Y)

scaled_X = scalerX.transform(scale_X)
scaled_X = DataFrame(scaled_X)
scaled_X.columns=["Daily_data","Hourly_data","Monthly_data","Pre_year_data"]
scaled_Y = scalery.transform(scale_Y)
scaled_Y = DataFrame(scaled_Y)
scaled_Y.columns=["Label_year_data"]

### add cyclical time features (sin and cos)
x = scaled_X.join(df.loc[:,["Hour","Month"]])
def encode(data, col, max_val):
    data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)
    data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)
    return data
#x = encode(x, 'Hour', 23)
x=encode(x,"Month",12)
x=x.drop(["Hour"],axis=1)
x=x.drop(["Month"],axis=1)
#train_x, test_x ,X_val= x[:(len(scaled_X)-n_val-n_test)], x[-n_test:], x[n_val:(n_test+n_val)]

train_x, test_x ,X_val= scaled_X[:(len(scaled_X)-n_val-n_test)], scaled_X[-n_test:], scaled_X[n_val:(n_test+n_val)]
train_y, test_y,y_val = scaled_Y[:(len(scaled_X)-n_val-n_test)], scaled_Y[-n_test:],scaled_Y[n_val:(n_test+n_val)]


print(train_x.shape)
Example #32
def _extracting_coordinates(dataframe: pd.DataFrame) -> pd.DataFrame:
    expanded_cols = pd.DataFrame(dataframe['coordenadas'].values.tolist(),
                                 columns=['latitude', 'longitude'])

    return dataframe.join(expanded_cols).drop('coordenadas', axis=1)
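Hypothetical usage, assuming each 'coordenadas' cell holds a [latitude, longitude] pair:

import pandas as pd

df = pd.DataFrame({'coordenadas': [[-23.55, -46.63], [40.71, -74.01]]})
print(_extracting_coordinates(df))  # 'latitude'/'longitude' columns, 'coordenadas' dropped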
Example #33
def generate_onehot_encoding(data: pd.DataFrame, column_name: str, drop=True):
    onehot_repr = pd.get_dummies(data[column_name])
    data = data.join(onehot_repr)
    data.drop(column_name, axis=1, inplace=True)
    return data
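Hypothetical usage with a toy categorical column:

import pandas as pd

data = pd.DataFrame({'color': ['red', 'blue', 'red'], 'v': [1, 2, 3]})
data = generate_onehot_encoding(data, 'color')  # indicator columns replace 'color'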
Example #34
                   columns=['event1', 'event2'])

lefth
righth
pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)

left2 = DataFrame([[1, 2], [3, 4], [5, 6]],
                  index=['a', 'c', 'e'],
                  columns=['Ohio', 'Nevada'])
right2 = DataFrame([[7, 8], [9, 10], [11, 12], [13, 14]],
                   index=['b', 'c', 'd', 'e'],
                   columns=['Missouri', 'Alabama'])
right2
left2
pd.merge(left2, right2, how='outer', left_index=True, right_index=True)
left2.join(right2, how='outer')  # the join method merges on the index; suited to frames with non-overlapping columns and identical or similar index structures
left1.join(right1, on='key')
another = DataFrame([[7, 8], [9, 10], [11, 12], [16, 17]],
                    index=['a', 'c', 'e', 'f'],
                    columns=['New York', 'Oregon'])
another
left2.join([right2, another])
right2
left2
left2.join([right2, another], how='outer')
'''
Things to consider before combining:
1. If the two objects' indexes differ, take the intersection or the union? (sketched just below)
2. Can the combined result still distinguish the data of the original objects?
3. Along which axis should the objects be concatenated?
'''
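A minimal sketch of point 1 above, using two toy frames whose indexes partly overlap:

a = DataFrame({'x': [1, 2]}, index=['a', 'b'])
b = DataFrame({'y': [3, 4]}, index=['b', 'c'])
a.join(b, how='inner')  # index intersection: only 'b'
a.join(b, how='outer')  # index union: 'a', 'b', 'c', with NaN where data is missing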
Example #35
    def test_join_inner_multiindex(self):
        key1 = [
            "bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux",
            "snap"
        ]
        key2 = [
            "two",
            "one",
            "three",
            "one",
            "two",
            "one",
            "two",
            "two",
            "three",
            "one",
        ]

        data = np.random.randn(len(key1))
        data = DataFrame({"key1": key1, "key2": key2, "data": data})

        index = MultiIndex(
            levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
            codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
            names=["first", "second"],
        )
        to_join = DataFrame(np.random.randn(10, 3),
                            index=index,
                            columns=["j_one", "j_two", "j_three"])

        joined = data.join(to_join, on=["key1", "key2"], how="inner")
        expected = merge(
            data,
            to_join.reset_index(),
            left_on=["key1", "key2"],
            right_on=["first", "second"],
            how="inner",
            sort=False,
        )

        expected2 = merge(
            to_join,
            data,
            right_on=["key1", "key2"],
            left_index=True,
            how="inner",
            sort=False,
        )
        tm.assert_frame_equal(joined, expected2.reindex_like(joined))

        expected2 = merge(
            to_join,
            data,
            right_on=["key1", "key2"],
            left_index=True,
            how="inner",
            sort=False,
        )

        expected = expected.drop(["first", "second"], axis=1)
        expected.index = joined.index

        assert joined.index.is_monotonic
        tm.assert_frame_equal(joined, expected)
Example #36
sqft_get_data = [xx for xx in sqft_cursor]
sqft_cursor.close()

# Getting city data in dataframe 
city_data_for_join_df = DataFrame(city_get_data)
city_data_for_join_df.columns = city_field_names

city_df=DataFrame(city_get_data)
city_df.columns = city_field_names

# Getting city sqft data in dataframe
sqft_df = DataFrame(sqft_get_data)
sqft_df.columns = sqft_field_names

# joining city and city sqft data frame
joined_city_sqft=city_data_for_join_df.join(sqft_df.set_index('CityCode'), on='CityCode')

# Transposing the data
master_melted_dataset_df=pandas.melt(joined_city_sqft, id_vars=["CityCode","CityName","Metro","County","State","PopulationRank"])

#Question3
print("Question 3")
full_average=master_melted_dataset_df["value"].mean()
print("Average of Price Sqft Dataset")
print(full_average, "\n")
print("Maximum of Price Sqft Dataset")
print(master_melted_dataset_df["value"].max(), "\n")
print("Minimum of Price Sqft Dataset")
print(master_melted_dataset_df["value"].min(), "\n")

#Question4
Example #37
    def _read_one_data(self, url, params):
        """ read one data from specified symbol """

        symbol = params['symbol']
        del params['symbol']
        url = url.format(symbol)

        resp = self._get_response(url, params=params)
        ptrn = r'root\.App\.main = (.*?);\n}\(this\)\);'
        try:
            j = json.loads(re.search(ptrn, resp.text, re.DOTALL).group(1))
            data = j['context']['dispatcher']['stores']['HistoricalPriceStore']
        except KeyError:
            msg = 'No data fetched for symbol {} using {}'
            raise RemoteDataError(msg.format(symbol, self.__class__.__name__))

        # price data
        prices = DataFrame(data['prices'])
        prices.columns = [col.capitalize() for col in prices.columns]
        prices['Date'] = to_datetime(
            to_datetime(prices['Date'], unit='s').dt.date)

        if 'Data' in prices.columns:
            prices = prices[prices['Data'].isnull()]
        prices = prices[[
            'Date', 'High', 'Low', 'Open', 'Close', 'Volume', 'Adjclose'
        ]]
        prices = prices.rename(columns={'Adjclose': 'Adj Close'})

        prices = prices.set_index('Date')
        prices = prices.sort_index().dropna(how='all')

        if self.ret_index:
            prices['Ret_Index'] = \
                _calc_return_index(prices['Adj Close'])
        if self.adjust_price:
            prices = _adjust_prices(prices)

        # dividends & splits data
        if self.get_actions and data['eventsData']:

            actions = DataFrame(data['eventsData'])
            actions.columns = [col.capitalize() for col in actions.columns]
            actions['Date'] = to_datetime(
                to_datetime(actions['Date'], unit='s').dt.date)

            types = actions['Type'].unique()
            if 'DIVIDEND' in types:
                divs = actions[actions.Type == 'DIVIDEND'].copy()
                divs = divs[['Date', 'Amount']].reset_index(drop=True)
                divs = divs.set_index('Date')
                divs = divs.rename(columns={'Amount': 'Dividends'})
                prices = prices.join(divs, how='outer')

            if 'SPLIT' in types:
                splits = actions[actions.Type == 'SPLIT'].copy()
                splits['SplitRatio'] = splits['Splitratio'].apply(
                    lambda x: eval(x))
                splits = splits.reset_index(drop=True)
                splits = splits.set_index('Date')
                splits['Splits'] = 1.0 / splits['SplitRatio']
                prices = prices.join(splits['Splits'], how='outer')

                if 'DIVIDEND' in types and not self.adjust_dividends:
                    # Adjust dividends to deal with splits
                    adj = prices['Splits'].sort_index(
                        ascending=False).fillna(1).cumprod()
                    adj = 1.0 / adj
                    prices['Dividends'] = prices['Dividends'] * adj

        return prices
Example #38
def options_to_rates(options, t_min=1. / 12., n_min=6):
    """
    Extract implied risk-free rates and dividend yield from
    standard European option quote file.

    Ignore data:
    - with time to maturity < t_min (in fraction of years)
    - with fewer than n_min quotes per maturity date

    Parameters
    ----------

    t_min: float (default: 1 month)
        Minimum time to maturity in fraction of years
    n_min: int (default: 6)
        minimum number of quotes per maturity date

    """

    grouped = options.groupby(nm.EXPIRY_DATE)

    expiry_dates = []
    implied_interest_rates = []
    implied_dividend_yields = []

    for spec, group in grouped:
        # implied vol for this type/expiry group

        index = group.index

        trade_date = group[nm.TRADE_DATE][index[0]]
        expiry_date = group[nm.EXPIRY_DATE][index[0]]
        spot = group[nm.SPOT][index[0]]
        days_to_expiry = (expiry_date - trade_date).days
        time_to_maturity = days_to_expiry / 365.0

        # exclude groups with too short time to maturity
        if time_to_maturity < t_min:
            continue

        # extract the put and call quotes
        calls = group[group[nm.OPTION_TYPE] == nm.CALL_OPTION]
        puts = group[group[nm.OPTION_TYPE] == nm.PUT_OPTION]

        # exclude groups with too few data points
        if (len(calls) < n_min) | (len(puts) < n_min):
            continue

        # calculate forward, implied interest rate and implied div. yield
        call_premium = DataFrame(
            (calls[nm.PRICE_BID] + calls[nm.PRICE_ASK]) / 2.,
            columns=[CALL_PREMIUM])
        call_premium.index = np.array(calls[nm.STRIKE])

        put_premium = DataFrame((puts[nm.PRICE_BID] + puts[nm.PRICE_ASK]) / 2.,
                                columns=[PUT_PREMIUM])
        put_premium.index = np.array(puts[nm.STRIKE])

        # use 'inner' join because some strikes are not quoted for C and P
        all_quotes = call_premium.join(put_premium, how='inner')
        all_quotes[nm.STRIKE] = all_quotes.index
        all_quotes['C-P'] = all_quotes[CALL_PREMIUM] - all_quotes[PUT_PREMIUM]

        y = np.array(all_quotes['C-P'])
        x = np.array(all_quotes[nm.STRIKE])
        A = np.vstack([x, np.ones(len(x))]).T
        a_1, a_0 = np.linalg.lstsq(A, y)[0]

        # intercept is last coef
        interest_rate = -np.log(-a_1) / time_to_maturity
        dividend_yield = np.log(spot / a_0) / time_to_maturity

        implied_interest_rates.append(interest_rate)
        implied_dividend_yields.append(dividend_yield)
        expiry_dates.append(expiry_date)

    rates = ds.riskfree_dividend_template().reindex(index=expiry_dates)
    rates[nm.INTEREST_RATE] = implied_interest_rates
    rates[nm.DIVIDEND_YIELD] = implied_dividend_yields

    return rates
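The least-squares step above rests on put-call parity: C - P = S*exp(-q*T) - exp(-r*T)*K is linear in the strike K, so the fitted slope and intercept recover the implied rate r and dividend yield q. A hedged numeric check with synthetic, noise-free quotes:

import numpy as np

S, r, q, T = 100.0, 0.02, 0.01, 0.5                  # toy spot, rates, year fraction
K = np.linspace(80, 120, 9)                          # strikes
c_minus_p = S * np.exp(-q * T) - np.exp(-r * T) * K  # parity values of C - P

A = np.vstack([K, np.ones(len(K))]).T
a_1, a_0 = np.linalg.lstsq(A, c_minus_p, rcond=None)[0]
print(-np.log(-a_1) / T)    # ~0.02: implied interest rate
print(np.log(S / a_0) / T)  # ~0.01: implied dividend yield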
Example #39
def convert_amenities(df: pd.DataFrame) -> pd.DataFrame:
    one_hot_df = one_hot_encode_amenities(df)
    return df.join(one_hot_df).drop(columns="amenities")
Example #40
    XT,
    BarycenterPredictor,
    EMDLoss,
    RndMarginalPredictor,
    Simulator,
    X,
    Y,
)

positions = DataFrame({"sensor_id": [0], "x": [10.0], "y": [10.0], "z": [10.0]})

hits = DataFrame({"event_id": [0], "x": [1.0], "y": [1.0], "z": [1.0], "energy": [1.0]})

waveforms = DataFrame({"sensor_id": [0], "event_id": [0], "charge": [20.0]})

ext_waveforms = waveforms.join(positions.set_index("sensor_id"), on="sensor_id")


class Test(unittest.TestCase):
    def test_constructors(self):
        print(XT(hits))
        print(Y(ext_waveforms))
        print(RndMarginalPredictor(hits))

    def test_simulator(self):
        sim = Simulator(positions, hits, waveforms)
        xt, y = sim.sample()
        print(xt, y)

    def test_emd_loss(self):
        loss = EMDLoss()
Example #41
class TestJoin(object):
    def setup_method(self, method):
        # aggregate multiple columns
        self.df = DataFrame({
            'key1': get_test_data(),
            'key2': get_test_data(),
            'data1': np.random.randn(N),
            'data2': np.random.randn(N)
        })

        # exclude a couple keys for fun
        self.df = self.df[self.df['key2'] > 1]

        self.df2 = DataFrame({
            'key1':
            get_test_data(n=N // 5),
            'key2':
            get_test_data(ngroups=NGROUPS // 2, n=N // 5),
            'value':
            np.random.randn(N // 5)
        })

        index, data = tm.getMixedTypeDict()
        self.target = DataFrame(data, index=index)

        # Join on string value
        self.source = DataFrame({
            'MergedA': data['A'],
            'MergedD': data['D']
        },
                                index=data['C'])

    def test_cython_left_outer_join(self):
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        ls, rs = libjoin.left_outer_join(left, right, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        exp_li = a_(
            [0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10])
        exp_ri = a_(
            [0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_cython_right_outer_join(self):
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        rs, ls = libjoin.left_outer_join(right, left, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        #            0        1        1        1
        exp_li = a_([
            0,
            1,
            2,
            3,
            4,
            5,
            3,
            4,
            5,
            3,
            4,
            5,
            #            2        2        4
            6,
            7,
            8,
            6,
            7,
            8,
            -1
        ])
        exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_cython_inner_join(self):
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64)
        max_group = 5

        ls, rs = libjoin.inner_join(left, right, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8])
        exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_left_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on='key2')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='left')

        joined_both = merge(self.df, self.df2)
        _check_join(self.df,
                    self.df2,
                    joined_both, ['key1', 'key2'],
                    how='left')

    def test_right_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on='key2', how='right')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='right')

        joined_both = merge(self.df, self.df2, how='right')
        _check_join(self.df,
                    self.df2,
                    joined_both, ['key1', 'key2'],
                    how='right')

    def test_full_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on='key2', how='outer')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='outer')

        joined_both = merge(self.df, self.df2, how='outer')
        _check_join(self.df,
                    self.df2,
                    joined_both, ['key1', 'key2'],
                    how='outer')

    def test_inner_join(self):
        joined_key2 = merge(self.df, self.df2, on='key2', how='inner')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner')

        joined_both = merge(self.df, self.df2, how='inner')
        _check_join(self.df,
                    self.df2,
                    joined_both, ['key1', 'key2'],
                    how='inner')

    def test_handle_overlap(self):
        joined = merge(self.df, self.df2, on='key2', suffixes=['.foo', '.bar'])

        assert 'key1.foo' in joined
        assert 'key1.bar' in joined

    def test_handle_overlap_arbitrary_key(self):
        joined = merge(self.df,
                       self.df2,
                       left_on='key2',
                       right_on='key1',
                       suffixes=['.foo', '.bar'])
        assert 'key1.foo' in joined
        assert 'key2.bar' in joined

    def test_join_on(self):
        target = self.target
        source = self.source

        merged = target.join(source, on='C')
        tm.assert_series_equal(merged['MergedA'],
                               target['A'],
                               check_names=False)
        tm.assert_series_equal(merged['MergedD'],
                               target['D'],
                               check_names=False)

        # join with duplicates (fix regression from DataFrame/Matrix merge)
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])
        joined = df.join(df2, on='key')
        expected = DataFrame({
            'key': ['a', 'a', 'b', 'b', 'c'],
            'value': [0, 0, 1, 1, 2]
        })
        assert_frame_equal(joined, expected)

        # Test when some are missing
        df_a = DataFrame([[1], [2], [3]],
                         index=['a', 'b', 'c'],
                         columns=['one'])
        df_b = DataFrame([['foo'], ['bar']], index=[1, 2], columns=['two'])
        df_c = DataFrame([[1], [2]], index=[1, 2], columns=['three'])
        joined = df_a.join(df_b, on='one')
        joined = joined.join(df_c, on='one')
        assert np.isnan(joined['two']['c'])
        assert np.isnan(joined['three']['c'])

        # merge column not present
        with pytest.raises(KeyError, match="^'E'$"):
            target.join(source, on='E')

        # overlap
        source_copy = source.copy()
        source_copy['A'] = 0
        msg = ("You are trying to merge on float64 and object columns. If"
               " you wish to proceed you should use pd.concat")
        with pytest.raises(ValueError, match=msg):
            target.join(source_copy, on='A')

    def test_join_on_fails_with_different_right_index(self):
        df = DataFrame({
            'a': np.random.choice(['m', 'f'], size=3),
            'b': np.random.randn(3)
        })
        df2 = DataFrame(
            {
                'a': np.random.choice(['m', 'f'], size=10),
                'b': np.random.randn(10)
            },
            index=tm.makeCustomIndex(10, 2))
        msg = (r'len\(left_on\) must equal the number of levels in the index'
               ' of "right"')
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, left_on='a', right_index=True)

    def test_join_on_fails_with_different_left_index(self):
        df = DataFrame(
            {
                'a': np.random.choice(['m', 'f'], size=3),
                'b': np.random.randn(3)
            },
            index=tm.makeCustomIndex(3, 2))
        df2 = DataFrame({
            'a': np.random.choice(['m', 'f'], size=10),
            'b': np.random.randn(10)
        })
        msg = (r'len\(right_on\) must equal the number of levels in the index'
               ' of "left"')
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, right_on='b', left_index=True)

    def test_join_on_fails_with_different_column_counts(self):
        df = DataFrame({
            'a': np.random.choice(['m', 'f'], size=3),
            'b': np.random.randn(3)
        })
        df2 = DataFrame(
            {
                'a': np.random.choice(['m', 'f'], size=10),
                'b': np.random.randn(10)
            },
            index=tm.makeCustomIndex(10, 2))
        msg = r"len\(right_on\) must equal len\(left_on\)"
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, right_on='a', left_on=['a', 'b'])

    @pytest.mark.parametrize("wrong_type", [2, 'str', None, np.array([0, 1])])
    def test_join_on_fails_with_wrong_object_type(self, wrong_type):
        # GH12081 - original issue

        # GH21220 - merging of Series and DataFrame is now allowed
        # Edited test to remove the Series object from test parameters

        df = DataFrame({'a': [1, 1]})
        msg = ("Can only merge Series or DataFrame objects, a {} was passed".
               format(str(type(wrong_type))))
        with pytest.raises(TypeError, match=msg):
            merge(wrong_type, df, left_on='a', right_on='a')
        with pytest.raises(TypeError, match=msg):
            merge(df, wrong_type, left_on='a', right_on='a')

    def test_join_on_pass_vector(self):
        expected = self.target.join(self.source, on='C')
        del expected['C']

        join_col = self.target.pop('C')
        result = self.target.join(self.source, on=join_col)
        assert_frame_equal(result, expected)

    def test_join_with_len0(self):
        # nothing to merge
        merged = self.target.join(self.source.reindex([]), on='C')
        for col in self.source:
            assert col in merged
            assert merged[col].isna().all()

        merged2 = self.target.join(self.source.reindex([]),
                                   on='C',
                                   how='inner')
        tm.assert_index_equal(merged2.columns, merged.columns)
        assert len(merged2) == 0

    def test_join_on_inner(self):
        df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1]}, index=['a', 'b'])

        joined = df.join(df2, on='key', how='inner')

        expected = df.join(df2, on='key')
        expected = expected[expected['value'].notna()]
        tm.assert_series_equal(joined['key'],
                               expected['key'],
                               check_dtype=False)
        tm.assert_series_equal(joined['value'],
                               expected['value'],
                               check_dtype=False)
        tm.assert_index_equal(joined.index, expected.index)

    def test_join_on_singlekey_list(self):
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])

        # corner cases
        joined = df.join(df2, on=['key'])
        expected = df.join(df2, on='key')

        assert_frame_equal(joined, expected)

    def test_join_on_series(self):
        result = self.target.join(self.source['MergedA'], on='C')
        expected = self.target.join(self.source[['MergedA']], on='C')
        assert_frame_equal(result, expected)

    def test_join_on_series_buglet(self):
        # GH #638
        df = DataFrame({'a': [1, 1]})
        ds = Series([2], index=[1], name='b')
        result = df.join(ds, on='a')
        expected = DataFrame({'a': [1, 1], 'b': [2, 2]}, index=df.index)
        tm.assert_frame_equal(result, expected)

    def test_join_index_mixed(self, join_type):
        # no overlapping blocks
        df1 = DataFrame(index=np.arange(10))
        df1['bool'] = True
        df1['string'] = 'foo'

        df2 = DataFrame(index=np.arange(5, 15))
        df2['int'] = 1
        df2['float'] = 1.

        joined = df1.join(df2, how=join_type)
        expected = _join_by_hand(df1, df2, how=join_type)
        assert_frame_equal(joined, expected)

        joined = df2.join(df1, how=join_type)
        expected = _join_by_hand(df2, df1, how=join_type)
        assert_frame_equal(joined, expected)

    def test_join_index_mixed_overlap(self):
        df1 = DataFrame({
            'A': 1.,
            'B': 2,
            'C': 'foo',
            'D': True
        },
                        index=np.arange(10),
                        columns=['A', 'B', 'C', 'D'])
        assert df1['B'].dtype == np.int64
        assert df1['D'].dtype == np.bool_

        df2 = DataFrame({
            'A': 1.,
            'B': 2,
            'C': 'foo',
            'D': True
        },
                        index=np.arange(0, 10, 2),
                        columns=['A', 'B', 'C', 'D'])

        # overlap
        joined = df1.join(df2, lsuffix='_one', rsuffix='_two')
        expected_columns = [
            'A_one', 'B_one', 'C_one', 'D_one', 'A_two', 'B_two', 'C_two',
            'D_two'
        ]
        df1.columns = expected_columns[:4]
        df2.columns = expected_columns[4:]
        expected = _join_by_hand(df1, df2)
        assert_frame_equal(joined, expected)

    def test_join_empty_bug(self):
        # generated an exception in 0.4.3
        x = DataFrame()
        x.join(DataFrame([3], index=[0], columns=['A']), how='outer')

    def test_join_unconsolidated(self):
        # GH #331
        a = DataFrame(randn(30, 2), columns=['a', 'b'])
        c = Series(randn(30))
        a['c'] = c
        d = DataFrame(randn(30, 1), columns=['q'])

        # it works!
        a.join(d)
        d.join(a)

    def test_join_multiindex(self):
        index1 = MultiIndex.from_arrays(
            [['a', 'a', 'a', 'b', 'b', 'b'], [1, 2, 3, 1, 2, 3]],
            names=['first', 'second'])

        index2 = MultiIndex.from_arrays(
            [['b', 'b', 'b', 'c', 'c', 'c'], [1, 2, 3, 1, 2, 3]],
            names=['first', 'second'])

        df1 = DataFrame(data=np.random.randn(6),
                        index=index1,
                        columns=['var X'])
        df2 = DataFrame(data=np.random.randn(6),
                        index=index2,
                        columns=['var Y'])

        df1 = df1.sort_index(level=0)
        df2 = df2.sort_index(level=0)

        joined = df1.join(df2, how='outer')
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names
        assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

        df1 = df1.sort_index(level=1)
        df2 = df2.sort_index(level=1)

        joined = df1.join(df2, how='outer').sort_index(level=0)
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names

        assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

    def test_join_inner_multiindex(self):
        key1 = [
            'bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', 'qux',
            'snap'
        ]
        key2 = [
            'two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', 'three',
            'one'
        ]

        data = np.random.randn(len(key1))
        data = DataFrame({'key1': key1, 'key2': key2, 'data': data})

        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                  [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        to_join = DataFrame(np.random.randn(10, 3),
                            index=index,
                            columns=['j_one', 'j_two', 'j_three'])

        joined = data.join(to_join, on=['key1', 'key2'], how='inner')
        expected = merge(data,
                         to_join.reset_index(),
                         left_on=['key1', 'key2'],
                         right_on=['first', 'second'],
                         how='inner',
                         sort=False)

        expected2 = merge(to_join,
                          data,
                          right_on=['key1', 'key2'],
                          left_index=True,
                          how='inner',
                          sort=False)
        assert_frame_equal(joined, expected2.reindex_like(joined))

        expected2 = merge(to_join,
                          data,
                          right_on=['key1', 'key2'],
                          left_index=True,
                          how='inner',
                          sort=False)

        expected = expected.drop(['first', 'second'], axis=1)
        expected.index = joined.index

        assert joined.index.is_monotonic
        assert_frame_equal(joined, expected)

        # _assert_same_contents(expected, expected2.loc[:, expected.columns])

    def test_join_hierarchical_mixed(self):
        # GH 2024
        df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c'])
        new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]})
        other_df = DataFrame([(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd'])
        other_df.set_index('a', inplace=True)
        # GH 9455, 12219
        with tm.assert_produces_warning(UserWarning):
            result = merge(new_df, other_df, left_index=True, right_index=True)
        assert ('b', 'mean') in result
        assert 'b' in result

    def test_join_float64_float32(self):

        a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64)
        b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32)
        joined = a.join(b)
        assert joined.dtypes['a'] == 'float64'
        assert joined.dtypes['b'] == 'float64'
        assert joined.dtypes['c'] == 'float32'

        a = np.random.randint(0, 5, 100).astype('int64')
        b = np.random.random(100).astype('float64')
        c = np.random.random(100).astype('float32')
        df = DataFrame({'a': a, 'b': b, 'c': c})
        xpdf = DataFrame({'a': a, 'b': b, 'c': c})
        s = DataFrame(np.random.random(5).astype('float32'), columns=['md'])
        rs = df.merge(s, left_on='a', right_index=True)
        assert rs.dtypes['a'] == 'int64'
        assert rs.dtypes['b'] == 'float64'
        assert rs.dtypes['c'] == 'float32'
        assert rs.dtypes['md'] == 'float32'

        xp = xpdf.merge(s, left_on='a', right_index=True)
        assert_frame_equal(rs, xp)

    def test_join_many_non_unique_index(self):
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])

        result = idf1.join([idf2, idf3], how='outer')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer')

        result = result.reset_index()
        expected = expected[result.columns]
        expected['a'] = expected.a.astype('int64')
        expected['b'] = expected.b.astype('int64')
        assert_frame_equal(result, expected)

        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
        df3 = DataFrame({
            "a": [1, 1, 1],
            "b": [1, 1, 2],
            "e": [1000, 2000, 3000]
        })
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])
        result = idf1.join([idf2, idf3], how='inner')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner')

        result = result.reset_index()

        assert_frame_equal(result, expected.loc[:, result.columns])

        # GH 11519
        df = DataFrame({
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
            'C':
            np.random.randn(8),
            'D':
            np.random.randn(8)
        })
        s = Series(np.repeat(np.arange(8), 2),
                   index=np.repeat(np.arange(8), 2),
                   name='TEST')
        inner = df.join(s, how='inner')
        outer = df.join(s, how='outer')
        left = df.join(s, how='left')
        right = df.join(s, how='right')
        assert_frame_equal(inner, outer)
        assert_frame_equal(inner, left)
        assert_frame_equal(inner, right)

    def test_join_sort(self):
        left = DataFrame({
            'key': ['foo', 'bar', 'baz', 'foo'],
            'value': [1, 2, 3, 4]
        })
        right = DataFrame({'value2': ['a', 'b', 'c']},
                          index=['bar', 'baz', 'foo'])

        joined = left.join(right, on='key', sort=True)
        expected = DataFrame(
            {
                'key': ['bar', 'baz', 'foo', 'foo'],
                'value': [2, 3, 1, 4],
                'value2': ['a', 'b', 'c', 'c']
            },
            index=[1, 2, 0, 3])
        assert_frame_equal(joined, expected)

        # smoke test
        joined = left.join(right, on='key', sort=False)
        tm.assert_index_equal(joined.index, pd.Index(lrange(4)))

    def test_join_mixed_non_unique_index(self):
        # GH 12814, unorderable types in py3 with a non-unique index
        df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a'])
        df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4])
        result = df1.join(df2)
        expected = DataFrame(
            {
                'a': [1, 2, 3, 3, 4],
                'b': [5, np.nan, 6, 7, np.nan]
            },
            index=[1, 2, 3, 3, 'a'])
        tm.assert_frame_equal(result, expected)

        df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a'])
        df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4])
        result = df3.join(df4)
        expected = DataFrame({
            'a': [1, 2, 3, 4],
            'b': [5, 6, 6, np.nan]
        },
                             index=[1, 2, 2, 'a'])
        tm.assert_frame_equal(result, expected)

    def test_join_non_unique_period_index(self):
        # GH #16871
        index = pd.period_range('2016-01-01', periods=16, freq='M')
        df = DataFrame(list(range(len(index))),
                       index=index,
                       columns=['pnum'])
        df2 = concat([df, df])
        result = df.join(df2, how='inner', rsuffix='_df2')
        expected = DataFrame(np.tile(
            np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
                             columns=['pnum', 'pnum_df2'],
                             index=df2.sort_index().index)
        tm.assert_frame_equal(result, expected)

    def test_mixed_type_join_with_suffix(self):
        # GH #916
        df = DataFrame(np.random.randn(20, 6),
                       columns=['a', 'b', 'c', 'd', 'e', 'f'])
        df.insert(0, 'id', 0)
        df.insert(5, 'dt', 'foo')

        grouped = df.groupby('id')
        mn = grouped.mean()
        cn = grouped.count()

        # it works!
        mn.join(cn, rsuffix='_right')

    def test_join_many(self):
        df = DataFrame(np.random.randn(10, 6), columns=list('abcdef'))
        df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]]

        joined = df_list[0].join(df_list[1:])
        tm.assert_frame_equal(joined, df)

        df_list = [
            df[['a', 'b']][:-2], df[['c', 'd']][2:], df[['e', 'f']][1:9]
        ]

        def _check_diff_index(df_list, result, exp_index):
            reindexed = [x.reindex(exp_index) for x in df_list]
            expected = reindexed[0].join(reindexed[1:])
            tm.assert_frame_equal(result, expected)

        # different join types
        joined = df_list[0].join(df_list[1:], how='outer')
        _check_diff_index(df_list, joined, df.index)

        joined = df_list[0].join(df_list[1:])
        _check_diff_index(df_list, joined, df_list[0].index)

        joined = df_list[0].join(df_list[1:], how='inner')
        _check_diff_index(df_list, joined, df.index[2:8])

        msg = "Joining multiple DataFrames only supported for joining on index"
        with pytest.raises(ValueError, match=msg):
            df_list[0].join(df_list[1:], on='a')

    def test_join_many_mixed(self):
        df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
        df['key'] = ['foo', 'bar'] * 4
        df1 = df.loc[:, ['A', 'B']]
        df2 = df.loc[:, ['C', 'D']]
        df3 = df.loc[:, ['key']]

        result = df1.join([df2, df3])
        assert_frame_equal(result, df)

    def test_join_dups(self):

        # joining dups
        df = concat([
            DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B']),
            DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2),
                      columns=['A', 'C'])
        ],
                    axis=1)

        expected = concat([df, df], axis=1)
        result = df.join(df, rsuffix='_2')
        result.columns = expected.columns
        assert_frame_equal(result, expected)

        # GH 4975, invalid join on dups
        w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])

        dta = x.merge(y, left_index=True,
                      right_index=True).merge(z,
                                              left_index=True,
                                              right_index=True,
                                              how="outer")
        dta = dta.merge(w, left_index=True, right_index=True)
        expected = concat([x, y, z, w], axis=1)
        expected.columns = [
            'x_x', 'y_x', 'x_y', 'y_y', 'x_x', 'y_x', 'x_y', 'y_y'
        ]
        assert_frame_equal(dta, expected)

    def test_join_multi_to_multi(self, join_type):
        # GH 20475
        leftindex = MultiIndex.from_product(
            [list('abc'), list('xy'), [1, 2]], names=['abc', 'xy', 'num'])
        left = DataFrame({'v1': range(12)}, index=leftindex)

        rightindex = MultiIndex.from_product(
            [list('abc'), list('xy')], names=['abc', 'xy'])
        right = DataFrame({'v2': [100 * i for i in range(1, 7)]},
                          index=rightindex)

        result = left.join(right, on=['abc', 'xy'], how=join_type)
        expected = (left.reset_index().merge(right.reset_index(),
                                             on=['abc', 'xy'],
                                             how=join_type).set_index(
                                                 ['abc', 'xy', 'num']))
        assert_frame_equal(expected, result)

        msg = (r'len\(left_on\) must equal the number of levels in the index'
               ' of "right"')
        with pytest.raises(ValueError, match=msg):
            left.join(right, on='xy', how=join_type)

        with pytest.raises(ValueError, match=msg):
            right.join(left, on=['abc', 'xy'], how=join_type)

    def test_join_on_tz_aware_datetimeindex(self):
        # GH 23931
        df1 = pd.DataFrame({
            'date':
            pd.date_range(start='2018-01-01', periods=5, tz='America/Chicago'),
            'vals':
            list('abcde')
        })

        df2 = pd.DataFrame({
            'date':
            pd.date_range(start='2018-01-03', periods=5, tz='America/Chicago'),
            'vals_2':
            list('tuvwx')
        })
        result = df1.join(df2.set_index('date'), on='date')
        expected = df1.copy()
        expected['vals_2'] = pd.Series([np.nan] * len(expected), dtype=object)
        assert_frame_equal(result, expected)
Example No. 42
class TestJoin(tm.TestCase):
    def setUp(self):
        # aggregate multiple columns
        self.df = DataFrame({
            'key1': get_test_data(),
            'key2': get_test_data(),
            'data1': np.random.randn(N),
            'data2': np.random.randn(N)
        })

        # exclude a couple keys for fun
        self.df = self.df[self.df['key2'] > 1]

        self.df2 = DataFrame({
            'key1':
            get_test_data(n=N // 5),
            'key2':
            get_test_data(ngroups=NGROUPS // 2, n=N // 5),
            'value':
            np.random.randn(N // 5)
        })

        index, data = tm.getMixedTypeDict()
        self.target = DataFrame(data, index=index)

        # Join on string value
        self.source = DataFrame({
            'MergedA': data['A'],
            'MergedD': data['D']
        },
                                index=data['C'])

    def test_cython_left_outer_join(self):
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        ls, rs = _join.left_outer_join(left, right, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        exp_li = a_(
            [0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10])
        exp_ri = a_(
            [0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_cython_right_outer_join(self):
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        rs, ls = _join.left_outer_join(right, left, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        #            0        1        1        1
        exp_li = a_([0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5,
                     #            2        2        4
                     6, 7, 8, 6, 7, 8, -1])
        exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_cython_inner_join(self):
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64)
        max_group = 5

        ls, rs = _join.inner_join(left, right, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8])
        exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_left_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on='key2')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='left')

        joined_both = merge(self.df, self.df2)
        _check_join(self.df,
                    self.df2,
                    joined_both, ['key1', 'key2'],
                    how='left')

    def test_right_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on='key2', how='right')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='right')

        joined_both = merge(self.df, self.df2, how='right')
        _check_join(self.df,
                    self.df2,
                    joined_both, ['key1', 'key2'],
                    how='right')

    def test_full_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on='key2', how='outer')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='outer')

        joined_both = merge(self.df, self.df2, how='outer')
        _check_join(self.df,
                    self.df2,
                    joined_both, ['key1', 'key2'],
                    how='outer')

    def test_inner_join(self):
        joined_key2 = merge(self.df, self.df2, on='key2', how='inner')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner')

        joined_both = merge(self.df, self.df2, how='inner')
        _check_join(self.df,
                    self.df2,
                    joined_both, ['key1', 'key2'],
                    how='inner')

    def test_handle_overlap(self):
        joined = merge(self.df, self.df2, on='key2', suffixes=['.foo', '.bar'])

        self.assertIn('key1.foo', joined)
        self.assertIn('key1.bar', joined)

    def test_handle_overlap_arbitrary_key(self):
        joined = merge(self.df,
                       self.df2,
                       left_on='key2',
                       right_on='key1',
                       suffixes=['.foo', '.bar'])
        self.assertIn('key1.foo', joined)
        self.assertIn('key2.bar', joined)

    def test_join_on(self):
        target = self.target
        source = self.source

        merged = target.join(source, on='C')
        self.assert_series_equal(merged['MergedA'],
                                 target['A'],
                                 check_names=False)
        self.assert_series_equal(merged['MergedD'],
                                 target['D'],
                                 check_names=False)

        # join with duplicates (fix regression from DataFrame/Matrix merge)
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])
        joined = df.join(df2, on='key')
        expected = DataFrame({
            'key': ['a', 'a', 'b', 'b', 'c'],
            'value': [0, 0, 1, 1, 2]
        })
        assert_frame_equal(joined, expected)

        # Test when some are missing
        df_a = DataFrame([[1], [2], [3]],
                         index=['a', 'b', 'c'],
                         columns=['one'])
        df_b = DataFrame([['foo'], ['bar']], index=[1, 2], columns=['two'])
        df_c = DataFrame([[1], [2]], index=[1, 2], columns=['three'])
        joined = df_a.join(df_b, on='one')
        joined = joined.join(df_c, on='one')
        self.assertTrue(np.isnan(joined['two']['c']))
        self.assertTrue(np.isnan(joined['three']['c']))

        # merge column not present
        self.assertRaises(KeyError, target.join, source, on='E')

        # overlap
        source_copy = source.copy()
        source_copy['A'] = 0
        self.assertRaises(ValueError, target.join, source_copy, on='A')

    def test_join_on_fails_with_different_right_index(self):
        with tm.assertRaises(ValueError):
            df = DataFrame({
                'a': np.random.choice(['m', 'f'], size=3),
                'b': np.random.randn(3)
            })
            df2 = DataFrame(
                {
                    'a': np.random.choice(['m', 'f'], size=10),
                    'b': np.random.randn(10)
                },
                index=tm.makeCustomIndex(10, 2))
            merge(df, df2, left_on='a', right_index=True)

    def test_join_on_fails_with_different_left_index(self):
        with tm.assertRaises(ValueError):
            df = DataFrame(
                {
                    'a': np.random.choice(['m', 'f'], size=3),
                    'b': np.random.randn(3)
                },
                index=tm.makeCustomIndex(3, 2))
            df2 = DataFrame({
                'a': np.random.choice(['m', 'f'], size=10),
                'b': np.random.randn(10)
            })
            merge(df, df2, right_on='b', left_index=True)

    def test_join_on_fails_with_different_column_counts(self):
        with tm.assertRaises(ValueError):
            df = DataFrame({
                'a': np.random.choice(['m', 'f'], size=3),
                'b': np.random.randn(3)
            })
            df2 = DataFrame(
                {
                    'a': np.random.choice(['m', 'f'], size=10),
                    'b': np.random.randn(10)
                },
                index=tm.makeCustomIndex(10, 2))
            merge(df, df2, right_on='a', left_on=['a', 'b'])

    def test_join_on_fails_with_wrong_object_type(self):
        # GH12081
        wrongly_typed = [Series([0, 1]), 2, 'str', None, np.array([0, 1])]
        df = DataFrame({'a': [1, 1]})

        for obj in wrongly_typed:
            with tm.assertRaisesRegexp(ValueError, str(type(obj))):
                merge(obj, df, left_on='a', right_on='a')
            with tm.assertRaisesRegexp(ValueError, str(type(obj))):
                merge(df, obj, left_on='a', right_on='a')

    def test_join_on_pass_vector(self):
        expected = self.target.join(self.source, on='C')
        del expected['C']

        join_col = self.target.pop('C')
        result = self.target.join(self.source, on=join_col)
        assert_frame_equal(result, expected)

    def test_join_with_len0(self):
        # nothing to merge
        merged = self.target.join(self.source.reindex([]), on='C')
        for col in self.source:
            self.assertIn(col, merged)
            self.assertTrue(merged[col].isnull().all())

        merged2 = self.target.join(self.source.reindex([]),
                                   on='C',
                                   how='inner')
        self.assert_index_equal(merged2.columns, merged.columns)
        self.assertEqual(len(merged2), 0)

    def test_join_on_inner(self):
        df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1]}, index=['a', 'b'])

        joined = df.join(df2, on='key', how='inner')

        expected = df.join(df2, on='key')
        expected = expected[expected['value'].notnull()]
        self.assert_series_equal(joined['key'],
                                 expected['key'],
                                 check_dtype=False)
        self.assert_series_equal(joined['value'],
                                 expected['value'],
                                 check_dtype=False)
        self.assert_index_equal(joined.index, expected.index)

    def test_join_on_singlekey_list(self):
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])

        # corner cases
        joined = df.join(df2, on=['key'])
        expected = df.join(df2, on='key')

        assert_frame_equal(joined, expected)

    def test_join_on_series(self):
        result = self.target.join(self.source['MergedA'], on='C')
        expected = self.target.join(self.source[['MergedA']], on='C')
        assert_frame_equal(result, expected)

    def test_join_on_series_buglet(self):
        # GH #638
        df = DataFrame({'a': [1, 1]})
        ds = Series([2], index=[1], name='b')
        result = df.join(ds, on='a')
        expected = DataFrame({'a': [1, 1], 'b': [2, 2]}, index=df.index)
        tm.assert_frame_equal(result, expected)

    def test_join_index_mixed(self):
        df1 = DataFrame({
            'A': 1.,
            'B': 2,
            'C': 'foo',
            'D': True
        },
                        index=np.arange(10),
                        columns=['A', 'B', 'C', 'D'])
        self.assertEqual(df1['B'].dtype, np.int64)
        self.assertEqual(df1['D'].dtype, np.bool_)

        df2 = DataFrame({
            'A': 1.,
            'B': 2,
            'C': 'foo',
            'D': True
        },
                        index=np.arange(0, 10, 2),
                        columns=['A', 'B', 'C', 'D'])

        # overlap
        joined = df1.join(df2, lsuffix='_one', rsuffix='_two')
        expected_columns = [
            'A_one', 'B_one', 'C_one', 'D_one', 'A_two', 'B_two', 'C_two',
            'D_two'
        ]
        df1.columns = expected_columns[:4]
        df2.columns = expected_columns[4:]
        expected = _join_by_hand(df1, df2)
        assert_frame_equal(joined, expected)

        # no overlapping blocks
        df1 = DataFrame(index=np.arange(10))
        df1['bool'] = True
        df1['string'] = 'foo'

        df2 = DataFrame(index=np.arange(5, 15))
        df2['int'] = 1
        df2['float'] = 1.

        for kind in ['inner', 'outer', 'left', 'right']:

            joined = df1.join(df2, how=kind)
            expected = _join_by_hand(df1, df2, how=kind)
            assert_frame_equal(joined, expected)

            joined = df2.join(df1, how=kind)
            expected = _join_by_hand(df2, df1, how=kind)
            assert_frame_equal(joined, expected)

    def test_join_empty_bug(self):
        # generated an exception in 0.4.3
        x = DataFrame()
        x.join(DataFrame([3], index=[0], columns=['A']), how='outer')

    def test_join_unconsolidated(self):
        # GH #331
        a = DataFrame(randn(30, 2), columns=['a', 'b'])
        c = Series(randn(30))
        a['c'] = c
        d = DataFrame(randn(30, 1), columns=['q'])

        # it works!
        a.join(d)
        d.join(a)

    def test_join_multiindex(self):
        index1 = MultiIndex.from_arrays(
            [['a', 'a', 'a', 'b', 'b', 'b'], [1, 2, 3, 1, 2, 3]],
            names=['first', 'second'])

        index2 = MultiIndex.from_arrays(
            [['b', 'b', 'b', 'c', 'c', 'c'], [1, 2, 3, 1, 2, 3]],
            names=['first', 'second'])

        df1 = DataFrame(data=np.random.randn(6),
                        index=index1,
                        columns=['var X'])
        df2 = DataFrame(data=np.random.randn(6),
                        index=index2,
                        columns=['var Y'])

        df1 = df1.sort_index(level=0)
        df2 = df2.sort_index(level=0)

        joined = df1.join(df2, how='outer')
        ex_index = index1._tuple_index.union(index2._tuple_index)
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names
        assert_frame_equal(joined, expected)
        self.assertEqual(joined.index.names, index1.names)

        df1 = df1.sort_index(level=1)
        df2 = df2.sort_index(level=1)

        joined = df1.join(df2, how='outer').sort_index(level=0)
        ex_index = index1._tuple_index.union(index2._tuple_index)
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names

        assert_frame_equal(joined, expected)
        self.assertEqual(joined.index.names, index1.names)

    def test_join_inner_multiindex(self):
        key1 = [
            'bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', 'qux',
            'snap'
        ]
        key2 = [
            'two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', 'three',
            'one'
        ]

        data = np.random.randn(len(key1))
        data = DataFrame({'key1': key1, 'key2': key2, 'data': data})

        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        to_join = DataFrame(np.random.randn(10, 3),
                            index=index,
                            columns=['j_one', 'j_two', 'j_three'])

        joined = data.join(to_join, on=['key1', 'key2'], how='inner')
        expected = merge(data,
                         to_join.reset_index(),
                         left_on=['key1', 'key2'],
                         right_on=['first', 'second'],
                         how='inner',
                         sort=False)

        expected2 = merge(to_join,
                          data,
                          right_on=['key1', 'key2'],
                          left_index=True,
                          how='inner',
                          sort=False)
        assert_frame_equal(joined, expected2.reindex_like(joined))

        expected2 = merge(to_join,
                          data,
                          right_on=['key1', 'key2'],
                          left_index=True,
                          how='inner',
                          sort=False)

        expected = expected.drop(['first', 'second'], axis=1)
        expected.index = joined.index

        self.assertTrue(joined.index.is_monotonic)
        assert_frame_equal(joined, expected)

        # _assert_same_contents(expected, expected2.loc[:, expected.columns])

    def test_join_hierarchical_mixed(self):
        # GH 2024
        df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c'])
        new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]})
        other_df = DataFrame([(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd'])
        other_df.set_index('a', inplace=True)
        # GH 9455, 12219
        with tm.assert_produces_warning(UserWarning):
            result = merge(new_df, other_df, left_index=True, right_index=True)
        self.assertTrue(('b', 'mean') in result)
        self.assertTrue('b' in result)

    def test_join_float64_float32(self):

        a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64)
        b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32)
        joined = a.join(b)
        self.assertEqual(joined.dtypes['a'], 'float64')
        self.assertEqual(joined.dtypes['b'], 'float64')
        self.assertEqual(joined.dtypes['c'], 'float32')

        a = np.random.randint(0, 5, 100).astype('int64')
        b = np.random.random(100).astype('float64')
        c = np.random.random(100).astype('float32')
        df = DataFrame({'a': a, 'b': b, 'c': c})
        xpdf = DataFrame({'a': a, 'b': b, 'c': c})
        s = DataFrame(np.random.random(5).astype('float32'), columns=['md'])
        rs = df.merge(s, left_on='a', right_index=True)
        self.assertEqual(rs.dtypes['a'], 'int64')
        self.assertEqual(rs.dtypes['b'], 'float64')
        self.assertEqual(rs.dtypes['c'], 'float32')
        self.assertEqual(rs.dtypes['md'], 'float32')

        xp = xpdf.merge(s, left_on='a', right_index=True)
        assert_frame_equal(rs, xp)

    def test_join_many_non_unique_index(self):
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])

        result = idf1.join([idf2, idf3], how='outer')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer')

        result = result.reset_index()
        expected = expected[result.columns]
        expected['a'] = expected.a.astype('int64')
        expected['b'] = expected.b.astype('int64')
        assert_frame_equal(result, expected)

        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
        df3 = DataFrame({
            "a": [1, 1, 1],
            "b": [1, 1, 2],
            "e": [1000, 2000, 3000]
        })
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])
        result = idf1.join([idf2, idf3], how='inner')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner')

        result = result.reset_index()

        assert_frame_equal(result, expected.loc[:, result.columns])

        # GH 11519
        df = DataFrame({
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
            'C':
            np.random.randn(8),
            'D':
            np.random.randn(8)
        })
        s = Series(np.repeat(np.arange(8), 2),
                   index=np.repeat(np.arange(8), 2),
                   name='TEST')
        inner = df.join(s, how='inner')
        outer = df.join(s, how='outer')
        left = df.join(s, how='left')
        right = df.join(s, how='right')
        assert_frame_equal(inner, outer)
        assert_frame_equal(inner, left)
        assert_frame_equal(inner, right)

    def test_join_sort(self):
        left = DataFrame({
            'key': ['foo', 'bar', 'baz', 'foo'],
            'value': [1, 2, 3, 4]
        })
        right = DataFrame({'value2': ['a', 'b', 'c']},
                          index=['bar', 'baz', 'foo'])

        joined = left.join(right, on='key', sort=True)
        expected = DataFrame(
            {
                'key': ['bar', 'baz', 'foo', 'foo'],
                'value': [2, 3, 1, 4],
                'value2': ['a', 'b', 'c', 'c']
            },
            index=[1, 2, 0, 3])
        assert_frame_equal(joined, expected)

        # smoke test
        joined = left.join(right, on='key', sort=False)
        self.assert_index_equal(joined.index, pd.Index(lrange(4)))

    def test_join_mixed_non_unique_index(self):
        # GH 12814, unorderable types in py3 with a non-unique index
        df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a'])
        df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4])
        result = df1.join(df2)
        expected = DataFrame(
            {
                'a': [1, 2, 3, 3, 4],
                'b': [5, np.nan, 6, 7, np.nan]
            },
            index=[1, 2, 3, 3, 'a'])
        tm.assert_frame_equal(result, expected)

        df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a'])
        df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4])
        result = df3.join(df4)
        expected = DataFrame({
            'a': [1, 2, 3, 4],
            'b': [5, 6, 6, np.nan]
        },
                             index=[1, 2, 2, 'a'])
        tm.assert_frame_equal(result, expected)

    def test_mixed_type_join_with_suffix(self):
        # GH #916
        df = DataFrame(np.random.randn(20, 6),
                       columns=['a', 'b', 'c', 'd', 'e', 'f'])
        df.insert(0, 'id', 0)
        df.insert(5, 'dt', 'foo')

        grouped = df.groupby('id')
        mn = grouped.mean()
        cn = grouped.count()

        # it works!
        mn.join(cn, rsuffix='_right')

    def test_join_many(self):
        df = DataFrame(np.random.randn(10, 6), columns=list('abcdef'))
        df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]]

        joined = df_list[0].join(df_list[1:])
        tm.assert_frame_equal(joined, df)

        df_list = [
            df[['a', 'b']][:-2], df[['c', 'd']][2:], df[['e', 'f']][1:9]
        ]

        def _check_diff_index(df_list, result, exp_index):
            reindexed = [x.reindex(exp_index) for x in df_list]
            expected = reindexed[0].join(reindexed[1:])
            tm.assert_frame_equal(result, expected)

        # different join types
        joined = df_list[0].join(df_list[1:], how='outer')
        _check_diff_index(df_list, joined, df.index)

        joined = df_list[0].join(df_list[1:])
        _check_diff_index(df_list, joined, df_list[0].index)

        joined = df_list[0].join(df_list[1:], how='inner')
        _check_diff_index(df_list, joined, df.index[2:8])

        self.assertRaises(ValueError, df_list[0].join, df_list[1:], on='a')

    def test_join_many_mixed(self):
        df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
        df['key'] = ['foo', 'bar'] * 4
        df1 = df.loc[:, ['A', 'B']]
        df2 = df.loc[:, ['C', 'D']]
        df3 = df.loc[:, ['key']]

        result = df1.join([df2, df3])
        assert_frame_equal(result, df)

    def test_join_dups(self):

        # joining dups
        df = concat([
            DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B']),
            DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2),
                      columns=['A', 'C'])
        ],
                    axis=1)

        expected = concat([df, df], axis=1)
        result = df.join(df, rsuffix='_2')
        result.columns = expected.columns
        assert_frame_equal(result, expected)

        # GH 4975, invalid join on dups
        w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])

        dta = x.merge(y, left_index=True,
                      right_index=True).merge(z,
                                              left_index=True,
                                              right_index=True,
                                              how="outer")
        dta = dta.merge(w, left_index=True, right_index=True)
        expected = concat([x, y, z, w], axis=1)
        expected.columns = [
            'x_x', 'y_x', 'x_y', 'y_y', 'x_x', 'y_x', 'x_y', 'y_y'
        ]
        assert_frame_equal(dta, expected)

    def test_panel_join(self):
        panel = tm.makePanel()
        tm.add_nans(panel)

        p1 = panel.iloc[:2, :10, :3]
        p2 = panel.iloc[2:, 5:, 2:]

        # left join
        result = p1.join(p2)
        expected = p1.copy()
        expected['ItemC'] = p2['ItemC']
        tm.assert_panel_equal(result, expected)

        # right join
        result = p1.join(p2, how='right')
        expected = p2.copy()
        expected['ItemA'] = p1['ItemA']
        expected['ItemB'] = p1['ItemB']
        expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC'])
        tm.assert_panel_equal(result, expected)

        # inner join
        result = p1.join(p2, how='inner')
        expected = panel.iloc[:, 5:10, 2:3]
        tm.assert_panel_equal(result, expected)

        # outer join
        result = p1.join(p2, how='outer')
        expected = p1.reindex(major=panel.major_axis, minor=panel.minor_axis)
        expected = expected.join(
            p2.reindex(major=panel.major_axis, minor=panel.minor_axis))
        tm.assert_panel_equal(result, expected)

    def test_panel_join_overlap(self):
        panel = tm.makePanel()
        tm.add_nans(panel)

        p1 = panel.loc[['ItemA', 'ItemB', 'ItemC']]
        p2 = panel.loc[['ItemB', 'ItemC']]

        # Expected index is
        #
        # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2
        joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2')
        p1_suf = p1.loc[['ItemB', 'ItemC']].add_suffix('_p1')
        p2_suf = p2.loc[['ItemB', 'ItemC']].add_suffix('_p2')
        no_overlap = panel.loc[['ItemA']]
        expected = no_overlap.join(p1_suf.join(p2_suf))
        tm.assert_panel_equal(joined, expected)

    def test_panel_join_many(self):
        tm.K = 10
        panel = tm.makePanel()
        tm.K = 4

        panels = [panel.iloc[:2], panel.iloc[2:6], panel.iloc[6:]]

        joined = panels[0].join(panels[1:])
        tm.assert_panel_equal(joined, panel)

        panels = [
            panel.iloc[:2, :-5], panel.iloc[2:6, 2:], panel.iloc[6:, 5:-7]
        ]

        data_dict = {}
        for p in panels:
            data_dict.update(p.iteritems())

        joined = panels[0].join(panels[1:], how='inner')
        expected = pd.Panel.from_dict(data_dict, intersect=True)
        tm.assert_panel_equal(joined, expected)

        joined = panels[0].join(panels[1:], how='outer')
        expected = pd.Panel.from_dict(data_dict, intersect=False)
        tm.assert_panel_equal(joined, expected)

        # edge cases
        self.assertRaises(ValueError,
                          panels[0].join,
                          panels[1:],
                          how='outer',
                          lsuffix='foo',
                          rsuffix='bar')
        self.assertRaises(ValueError, panels[0].join, panels[1:], how='right')
Example No. 43
runtime_yf.reset_index(inplace=True)
runtime_yf = runtime_yf.rename(columns={'index': 'number of stocks'})
runtime_yf['number of stocks'] += 1


# runtimes using csv files

# For this project, we assume the CSV files live in a
# data/ subdirectory alongside the .py file.

results = []
for j, ticker in enumerate(djia):
    startTime = perf_counter()
    filename = "data/" + ticker + ".csv"
    df = pd.read_csv(filename, encoding='utf-8')
    endTime = perf_counter()
    elapsed = endTime - startTime
    if j > 0:
        # accumulate so each entry is the total load time
        # for the first j + 1 stocks
        elapsed += results[j - 1]
    results.append(elapsed)

runtime_csv = DataFrame(results, columns=['runtime'])
runtime_csv.reset_index(inplace=True)
runtime_csv = runtime_csv.rename(columns={'index': 'number of stocks'})
runtime_csv['number of stocks'] += 1

runtimes = runtime_yf.join(runtime_csv, lsuffix='_yf', rsuffix='_csv')
runtimes = runtimes.rename(columns={'number of stocks_yf': 'number of stocks'})
runtimes = runtimes.drop(columns=['number of stocks_csv'])
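
# With the per-source cumulative runtimes side by side, a comparison plot is
# the natural next step. A minimal sketch (not in the original snippet),
# assuming matplotlib is installed and the runtimes frame above is in scope:
import matplotlib.pyplot as plt

runtimes.plot(x='number of stocks', y=['runtime_yf', 'runtime_csv'],
              figsize=(12, 6), linewidth=3, fontsize=16)
plt.xlabel('number of stocks', fontsize=16)
plt.ylabel('cumulative runtime (s)', fontsize=16)
plt.show()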
Example No. 44
invalid_times = ['09:31:00', '09:32:00', '09:33:00', '09:34:00']

# Iterate over a snapshot of the index so rows can be dropped safely
# while looping (positions shift as rows are removed in place).
for ts in list(gdata.index):
    if str(ts)[-8:] in invalid_times:
        print("Dropping row at index " + str(ts) + " at " + time.ctime())
        gdata.drop(ts, inplace=True)
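
# The same filtering can be done without an explicit loop; a vectorized
# alternative (sketch only, assuming pandas is imported as pd and gdata
# has a DatetimeIndex):
#
#   mask = pd.Index(gdata.index.strftime('%H:%M:%S')).isin(invalid_times)
#   gdata = gdata[~mask]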
'''
Index and join generated image data to clean financial data
===========================================================

After getting the correct DatetimeIndex (DTI) in place, inner join the two
DataFrames on the index
'''

test = clean.join(gdata, how='inner')
'''
Generate target data for model training
=======================================

NB: targets being generated from forward data means we will lose a few
train / test examples on the near-term end of the time series
'''

# stupidly simple binary loop; flexible to whatever is specified in mins_ahead:
ahead = []

for i in range(len(clean) - max(mins_ahead)):
    current_row = [
        1 if clean.iloc[i + mins_ahead[j], 0] > clean.iloc[i, 0] else 0
        for j in range(len(mins_ahead))
    ]
    # assumed completion; the source snippet is truncated here
    ahead.append(current_row)
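
# A plausible continuation (not in the original snippet): collect the rows
# into a target frame aligned with the rows of `clean` that produced them.
# The column names below are illustrative assumptions.
targets = DataFrame(
    ahead,
    index=clean.index[:len(clean) - max(mins_ahead)],
    columns=['up_in_%dm' % m for m in mins_ahead],
)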
Example No. 45
class TestJoin:
    def setup_method(self, method):
        # aggregate multiple columns
        self.df = DataFrame({
            "key1": get_test_data(),
            "key2": get_test_data(),
            "data1": np.random.randn(N),
            "data2": np.random.randn(N),
        })

        # exclude a couple keys for fun
        self.df = self.df[self.df["key2"] > 1]

        self.df2 = DataFrame({
            "key1":
            get_test_data(n=N // 5),
            "key2":
            get_test_data(ngroups=NGROUPS // 2, n=N // 5),
            "value":
            np.random.randn(N // 5),
        })

        index, data = tm.getMixedTypeDict()
        self.target = DataFrame(data, index=index)

        # Join on string value
        self.source = DataFrame({
            "MergedA": data["A"],
            "MergedD": data["D"]
        },
                                index=data["C"])

    def test_left_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on="key2")
        _check_join(self.df, self.df2, joined_key2, ["key2"], how="left")

        joined_both = merge(self.df, self.df2)
        _check_join(self.df,
                    self.df2,
                    joined_both, ["key1", "key2"],
                    how="left")

    def test_right_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on="key2", how="right")
        _check_join(self.df, self.df2, joined_key2, ["key2"], how="right")

        joined_both = merge(self.df, self.df2, how="right")
        _check_join(self.df,
                    self.df2,
                    joined_both, ["key1", "key2"],
                    how="right")

    def test_full_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on="key2", how="outer")
        _check_join(self.df, self.df2, joined_key2, ["key2"], how="outer")

        joined_both = merge(self.df, self.df2, how="outer")
        _check_join(self.df,
                    self.df2,
                    joined_both, ["key1", "key2"],
                    how="outer")

    def test_inner_join(self):
        joined_key2 = merge(self.df, self.df2, on="key2", how="inner")
        _check_join(self.df, self.df2, joined_key2, ["key2"], how="inner")

        joined_both = merge(self.df, self.df2, how="inner")
        _check_join(self.df,
                    self.df2,
                    joined_both, ["key1", "key2"],
                    how="inner")

    def test_handle_overlap(self):
        joined = merge(self.df, self.df2, on="key2", suffixes=(".foo", ".bar"))

        assert "key1.foo" in joined
        assert "key1.bar" in joined

    def test_handle_overlap_arbitrary_key(self):
        joined = merge(
            self.df,
            self.df2,
            left_on="key2",
            right_on="key1",
            suffixes=(".foo", ".bar"),
        )
        assert "key1.foo" in joined
        assert "key2.bar" in joined

    def test_join_on(self):
        target = self.target
        source = self.source

        merged = target.join(source, on="C")
        tm.assert_series_equal(merged["MergedA"],
                               target["A"],
                               check_names=False)
        tm.assert_series_equal(merged["MergedD"],
                               target["D"],
                               check_names=False)

        # join with duplicates (fix regression from DataFrame/Matrix merge)
        df = DataFrame({"key": ["a", "a", "b", "b", "c"]})
        df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])
        joined = df.join(df2, on="key")
        expected = DataFrame({
            "key": ["a", "a", "b", "b", "c"],
            "value": [0, 0, 1, 1, 2]
        })
        tm.assert_frame_equal(joined, expected)

        # Test when some are missing
        df_a = DataFrame([[1], [2], [3]],
                         index=["a", "b", "c"],
                         columns=["one"])
        df_b = DataFrame([["foo"], ["bar"]], index=[1, 2], columns=["two"])
        df_c = DataFrame([[1], [2]], index=[1, 2], columns=["three"])
        joined = df_a.join(df_b, on="one")
        joined = joined.join(df_c, on="one")
        assert np.isnan(joined["two"]["c"])
        assert np.isnan(joined["three"]["c"])

        # merge column not present
        with pytest.raises(KeyError, match="^'E'$"):
            target.join(source, on="E")

        # overlap
        source_copy = source.copy()
        source_copy["A"] = 0
        msg = ("You are trying to merge on float64 and object columns. If "
               "you wish to proceed you should use pd.concat")
        with pytest.raises(ValueError, match=msg):
            target.join(source_copy, on="A")

    def test_join_on_fails_with_different_right_index(self):
        df = DataFrame({
            "a": np.random.choice(["m", "f"], size=3),
            "b": np.random.randn(3)
        })
        df2 = DataFrame(
            {
                "a": np.random.choice(["m", "f"], size=10),
                "b": np.random.randn(10)
            },
            index=tm.makeCustomIndex(10, 2),
        )
        msg = r'len\(left_on\) must equal the number of levels in the index of "right"'
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, left_on="a", right_index=True)

    def test_join_on_fails_with_different_left_index(self):
        df = DataFrame(
            {
                "a": np.random.choice(["m", "f"], size=3),
                "b": np.random.randn(3)
            },
            index=tm.makeCustomIndex(3, 2),
        )
        df2 = DataFrame({
            "a": np.random.choice(["m", "f"], size=10),
            "b": np.random.randn(10)
        })
        msg = r'len\(right_on\) must equal the number of levels in the index of "left"'
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, right_on="b", left_index=True)

    def test_join_on_fails_with_different_column_counts(self):
        df = DataFrame({
            "a": np.random.choice(["m", "f"], size=3),
            "b": np.random.randn(3)
        })
        df2 = DataFrame(
            {
                "a": np.random.choice(["m", "f"], size=10),
                "b": np.random.randn(10)
            },
            index=tm.makeCustomIndex(10, 2),
        )
        msg = r"len\(right_on\) must equal len\(left_on\)"
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, right_on="a", left_on=["a", "b"])

    @pytest.mark.parametrize("wrong_type", [2, "str", None, np.array([0, 1])])
    def test_join_on_fails_with_wrong_object_type(self, wrong_type):
        # GH12081 - original issue

        # GH21220 - merging of Series and DataFrame is now allowed
        # Edited test to remove the Series object from test parameters

        df = DataFrame({"a": [1, 1]})
        msg = ("Can only merge Series or DataFrame objects, "
               f"a {type(wrong_type)} was passed")
        with pytest.raises(TypeError, match=msg):
            merge(wrong_type, df, left_on="a", right_on="a")
        with pytest.raises(TypeError, match=msg):
            merge(df, wrong_type, left_on="a", right_on="a")

    def test_join_on_pass_vector(self):
        expected = self.target.join(self.source, on="C")
        del expected["C"]

        join_col = self.target.pop("C")
        result = self.target.join(self.source, on=join_col)
        tm.assert_frame_equal(result, expected)

    def test_join_with_len0(self):
        # nothing to merge
        merged = self.target.join(self.source.reindex([]), on="C")
        for col in self.source:
            assert col in merged
            assert merged[col].isna().all()

        merged2 = self.target.join(self.source.reindex([]),
                                   on="C",
                                   how="inner")
        tm.assert_index_equal(merged2.columns, merged.columns)
        assert len(merged2) == 0

    def test_join_on_inner(self):
        df = DataFrame({"key": ["a", "a", "d", "b", "b", "c"]})
        df2 = DataFrame({"value": [0, 1]}, index=["a", "b"])

        joined = df.join(df2, on="key", how="inner")

        expected = df.join(df2, on="key")
        expected = expected[expected["value"].notna()]
        tm.assert_series_equal(joined["key"], expected["key"])
        tm.assert_series_equal(joined["value"],
                               expected["value"],
                               check_dtype=False)
        tm.assert_index_equal(joined.index, expected.index)

    def test_join_on_singlekey_list(self):
        df = DataFrame({"key": ["a", "a", "b", "b", "c"]})
        df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])

        # corner cases
        joined = df.join(df2, on=["key"])
        expected = df.join(df2, on="key")

        tm.assert_frame_equal(joined, expected)

    def test_join_on_series(self):
        result = self.target.join(self.source["MergedA"], on="C")
        expected = self.target.join(self.source[["MergedA"]], on="C")
        tm.assert_frame_equal(result, expected)

    def test_join_on_series_buglet(self):
        # GH #638
        df = DataFrame({"a": [1, 1]})
        ds = Series([2], index=[1], name="b")
        result = df.join(ds, on="a")
        expected = DataFrame({"a": [1, 1], "b": [2, 2]}, index=df.index)
        tm.assert_frame_equal(result, expected)

    def test_join_index_mixed(self, join_type):
        # no overlapping blocks
        df1 = DataFrame(index=np.arange(10))
        df1["bool"] = True
        df1["string"] = "foo"

        df2 = DataFrame(index=np.arange(5, 15))
        df2["int"] = 1
        df2["float"] = 1.0

        joined = df1.join(df2, how=join_type)
        expected = _join_by_hand(df1, df2, how=join_type)
        tm.assert_frame_equal(joined, expected)

        joined = df2.join(df1, how=join_type)
        expected = _join_by_hand(df2, df1, how=join_type)
        tm.assert_frame_equal(joined, expected)

    def test_join_index_mixed_overlap(self):
        df1 = DataFrame(
            {
                "A": 1.0,
                "B": 2,
                "C": "foo",
                "D": True
            },
            index=np.arange(10),
            columns=["A", "B", "C", "D"],
        )
        assert df1["B"].dtype == np.int64
        assert df1["D"].dtype == np.bool_

        df2 = DataFrame(
            {
                "A": 1.0,
                "B": 2,
                "C": "foo",
                "D": True
            },
            index=np.arange(0, 10, 2),
            columns=["A", "B", "C", "D"],
        )

        # overlap
        joined = df1.join(df2, lsuffix="_one", rsuffix="_two")
        expected_columns = [
            "A_one", "B_one", "C_one", "D_one",
            "A_two", "B_two", "C_two", "D_two",
        ]
        df1.columns = expected_columns[:4]
        df2.columns = expected_columns[4:]
        expected = _join_by_hand(df1, df2)
        tm.assert_frame_equal(joined, expected)

    def test_join_empty_bug(self):
        # generated an exception in 0.4.3
        x = DataFrame()
        x.join(DataFrame([3], index=[0], columns=["A"]), how="outer")

    def test_join_unconsolidated(self):
        # GH #331
        a = DataFrame(np.random.randn(30, 2), columns=["a", "b"])
        c = Series(np.random.randn(30))
        a["c"] = c
        d = DataFrame(np.random.randn(30, 1), columns=["q"])

        # it works!
        a.join(d)
        d.join(a)

    def test_join_multiindex(self):
        index1 = MultiIndex.from_arrays(
            [["a", "a", "a", "b", "b", "b"], [1, 2, 3, 1, 2, 3]],
            names=["first", "second"],
        )

        index2 = MultiIndex.from_arrays(
            [["b", "b", "b", "c", "c", "c"], [1, 2, 3, 1, 2, 3]],
            names=["first", "second"],
        )

        df1 = DataFrame(data=np.random.randn(6),
                        index=index1,
                        columns=["var X"])
        df2 = DataFrame(data=np.random.randn(6),
                        index=index2,
                        columns=["var Y"])

        df1 = df1.sort_index(level=0)
        df2 = df2.sort_index(level=0)

        joined = df1.join(df2, how="outer")
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names
        tm.assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

        df1 = df1.sort_index(level=1)
        df2 = df2.sort_index(level=1)

        joined = df1.join(df2, how="outer").sort_index(level=0)
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names

        tm.assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

    def test_join_inner_multiindex(self):
        key1 = [
            "bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux",
            "snap"
        ]
        key2 = [
            "two", "one", "three", "one", "two", "one", "two", "two", "three",
            "one",
        ]

        data = np.random.randn(len(key1))
        data = DataFrame({"key1": key1, "key2": key2, "data": data})

        index = MultiIndex(
            levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
            codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
            names=["first", "second"],
        )
        to_join = DataFrame(np.random.randn(10, 3),
                            index=index,
                            columns=["j_one", "j_two", "j_three"])

        joined = data.join(to_join, on=["key1", "key2"], how="inner")
        expected = merge(
            data,
            to_join.reset_index(),
            left_on=["key1", "key2"],
            right_on=["first", "second"],
            how="inner",
            sort=False,
        )

        expected2 = merge(
            to_join,
            data,
            right_on=["key1", "key2"],
            left_index=True,
            how="inner",
            sort=False,
        )
        tm.assert_frame_equal(joined, expected2.reindex_like(joined))

        expected2 = merge(
            to_join,
            data,
            right_on=["key1", "key2"],
            left_index=True,
            how="inner",
            sort=False,
        )

        expected = expected.drop(["first", "second"], axis=1)
        expected.index = joined.index

        assert joined.index.is_monotonic
        tm.assert_frame_equal(joined, expected)

        # _assert_same_contents(expected, expected2.loc[:, expected.columns])

    def test_join_hierarchical_mixed(self):
        # GH 2024
        df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "c"])
        new_df = df.groupby(["a"]).agg({"b": [np.mean, np.sum]})
        other_df = DataFrame([(1, 2, 3), (7, 10, 6)], columns=["a", "b", "d"])
        other_df.set_index("a", inplace=True)
        # GH 9455, 12219
        with tm.assert_produces_warning(FutureWarning):
            result = merge(new_df, other_df, left_index=True, right_index=True)
        assert ("b", "mean") in result
        assert "b" in result

    def test_join_float64_float32(self):

        a = DataFrame(np.random.randn(10, 2),
                      columns=["a", "b"],
                      dtype=np.float64)
        b = DataFrame(np.random.randn(10, 1), columns=["c"], dtype=np.float32)
        joined = a.join(b)
        assert joined.dtypes["a"] == "float64"
        assert joined.dtypes["b"] == "float64"
        assert joined.dtypes["c"] == "float32"

        a = np.random.randint(0, 5, 100).astype("int64")
        b = np.random.random(100).astype("float64")
        c = np.random.random(100).astype("float32")
        df = DataFrame({"a": a, "b": b, "c": c})
        xpdf = DataFrame({"a": a, "b": b, "c": c})
        s = DataFrame(np.random.random(5).astype("float32"), columns=["md"])
        rs = df.merge(s, left_on="a", right_index=True)
        assert rs.dtypes["a"] == "int64"
        assert rs.dtypes["b"] == "float64"
        assert rs.dtypes["c"] == "float32"
        assert rs.dtypes["md"] == "float32"

        xp = xpdf.merge(s, left_on="a", right_index=True)
        tm.assert_frame_equal(rs, xp)

    def test_join_many_non_unique_index(self):
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])

        result = idf1.join([idf2, idf3], how="outer")

        df_partially_merged = merge(df1, df2, on=["a", "b"], how="outer")
        expected = merge(df_partially_merged, df3, on=["a", "b"], how="outer")

        result = result.reset_index()
        expected = expected[result.columns]
        expected["a"] = expected.a.astype("int64")
        expected["b"] = expected.b.astype("int64")
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
        df3 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])
        result = idf1.join([idf2, idf3], how="inner")

        df_partially_merged = merge(df1, df2, on=["a", "b"], how="inner")
        expected = merge(df_partially_merged, df3, on=["a", "b"], how="inner")

        result = result.reset_index()

        tm.assert_frame_equal(result, expected.loc[:, result.columns])

        # GH 11519
        df = DataFrame({
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.randn(8),
            "D": np.random.randn(8),
        })
        s = Series(np.repeat(np.arange(8), 2),
                   index=np.repeat(np.arange(8), 2),
                   name="TEST")
        inner = df.join(s, how="inner")
        outer = df.join(s, how="outer")
        left = df.join(s, how="left")
        right = df.join(s, how="right")
        tm.assert_frame_equal(inner, outer)
        tm.assert_frame_equal(inner, left)
        tm.assert_frame_equal(inner, right)

    def test_join_sort(self):
        left = DataFrame({
            "key": ["foo", "bar", "baz", "foo"],
            "value": [1, 2, 3, 4]
        })
        right = DataFrame({"value2": ["a", "b", "c"]},
                          index=["bar", "baz", "foo"])

        joined = left.join(right, on="key", sort=True)
        expected = DataFrame(
            {
                "key": ["bar", "baz", "foo", "foo"],
                "value": [2, 3, 1, 4],
                "value2": ["a", "b", "c", "c"],
            },
            index=[1, 2, 0, 3],
        )
        tm.assert_frame_equal(joined, expected)

        # smoke test
        joined = left.join(right, on="key", sort=False)
        tm.assert_index_equal(joined.index, Index(range(4)), exact=True)

    def test_join_mixed_non_unique_index(self):
        # GH 12814, unorderable types in py3 with a non-unique index
        df1 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 3, "a"])
        df2 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 3, 3, 4])
        result = df1.join(df2)
        expected = DataFrame(
            {
                "a": [1, 2, 3, 3, 4],
                "b": [5, np.nan, 6, 7, np.nan]
            },
            index=[1, 2, 3, 3, "a"],
        )
        tm.assert_frame_equal(result, expected)

        df3 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 2, "a"])
        df4 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 2, 3, 4])
        result = df3.join(df4)
        expected = DataFrame({
            "a": [1, 2, 3, 4],
            "b": [5, 6, 6, np.nan]
        }, index=[1, 2, 2, "a"])
        tm.assert_frame_equal(result, expected)

    def test_join_non_unique_period_index(self):
        # GH #16871
        index = pd.period_range("2016-01-01", periods=16, freq="M")
        df = DataFrame(list(range(len(index))), index=index, columns=["pnum"])
        df2 = concat([df, df])
        result = df.join(df2, how="inner", rsuffix="_df2")
        expected = DataFrame(
            np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
            columns=["pnum", "pnum_df2"],
            index=df2.sort_index().index,
        )
        tm.assert_frame_equal(result, expected)

    def test_mixed_type_join_with_suffix(self):
        # GH #916
        df = DataFrame(np.random.randn(20, 6),
                       columns=["a", "b", "c", "d", "e", "f"])
        df.insert(0, "id", 0)
        df.insert(5, "dt", "foo")

        grouped = df.groupby("id")
        mn = grouped.mean()
        cn = grouped.count()

        # it works!
        mn.join(cn, rsuffix="_right")

    def test_join_many(self):
        df = DataFrame(np.random.randn(10, 6), columns=list("abcdef"))
        df_list = [df[["a", "b"]], df[["c", "d"]], df[["e", "f"]]]

        joined = df_list[0].join(df_list[1:])
        tm.assert_frame_equal(joined, df)

        df_list = [
            df[["a", "b"]][:-2], df[["c", "d"]][2:], df[["e", "f"]][1:9]
        ]

        def _check_diff_index(df_list, result, exp_index):
            reindexed = [x.reindex(exp_index) for x in df_list]
            expected = reindexed[0].join(reindexed[1:])
            tm.assert_frame_equal(result, expected)

        # different join types
        joined = df_list[0].join(df_list[1:], how="outer")
        _check_diff_index(df_list, joined, df.index)

        joined = df_list[0].join(df_list[1:])
        _check_diff_index(df_list, joined, df_list[0].index)

        joined = df_list[0].join(df_list[1:], how="inner")
        _check_diff_index(df_list, joined, df.index[2:8])

        msg = "Joining multiple DataFrames only supported for joining on index"
        with pytest.raises(ValueError, match=msg):
            df_list[0].join(df_list[1:], on="a")

    def test_join_many_mixed(self):
        df = DataFrame(np.random.randn(8, 4), columns=["A", "B", "C", "D"])
        df["key"] = ["foo", "bar"] * 4
        df1 = df.loc[:, ["A", "B"]]
        df2 = df.loc[:, ["C", "D"]]
        df3 = df.loc[:, ["key"]]

        result = df1.join([df2, df3])
        tm.assert_frame_equal(result, df)

    def test_join_dups(self):

        # joining dups
        df = concat(
            [
                DataFrame(np.random.randn(10, 4),
                          columns=["A", "A", "B", "B"]),
                DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2),
                          columns=["A", "C"]),
            ],
            axis=1,
        )

        expected = concat([df, df], axis=1)
        result = df.join(df, rsuffix="_2")
        result.columns = expected.columns
        tm.assert_frame_equal(result, expected)

        # GH 4975, invalid join on dups
        w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])

        dta = x.merge(y, left_index=True,
                      right_index=True).merge(z,
                                              left_index=True,
                                              right_index=True,
                                              how="outer")
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            dta = dta.merge(w, left_index=True, right_index=True)
        expected = concat([x, y, z, w], axis=1)
        expected.columns = [
            "x_x", "y_x", "x_y", "y_y", "x_x", "y_x", "x_y", "y_y"
        ]
        tm.assert_frame_equal(dta, expected)

    def test_join_multi_to_multi(self, join_type):
        # GH 20475
        leftindex = MultiIndex.from_product(
            [list("abc"), list("xy"), [1, 2]], names=["abc", "xy", "num"])
        left = DataFrame({"v1": range(12)}, index=leftindex)

        rightindex = MultiIndex.from_product(
            [list("abc"), list("xy")], names=["abc", "xy"])
        right = DataFrame({"v2": [100 * i for i in range(1, 7)]},
                          index=rightindex)

        result = left.join(right, on=["abc", "xy"], how=join_type)
        expected = (left.reset_index().merge(right.reset_index(),
                                             on=["abc", "xy"],
                                             how=join_type).set_index(
                                                 ["abc", "xy", "num"]))
        tm.assert_frame_equal(expected, result)

        msg = r'len\(left_on\) must equal the number of levels in the index of "right"'
        with pytest.raises(ValueError, match=msg):
            left.join(right, on="xy", how=join_type)

        with pytest.raises(ValueError, match=msg):
            right.join(left, on=["abc", "xy"], how=join_type)

    def test_join_on_tz_aware_datetimeindex(self):
        # GH 23931, 26335
        df1 = DataFrame({
            "date":
            pd.date_range(start="2018-01-01", periods=5, tz="America/Chicago"),
            "vals":
            list("abcde"),
        })

        df2 = DataFrame({
            "date":
            pd.date_range(start="2018-01-03", periods=5, tz="America/Chicago"),
            "vals_2":
            list("tuvwx"),
        })
        result = df1.join(df2.set_index("date"), on="date")
        expected = df1.copy()
        expected["vals_2"] = Series([np.nan] * 2 + list("tuv"), dtype=object)
        tm.assert_frame_equal(result, expected)

    def test_join_datetime_string(self):
        # GH 5647
        dfa = DataFrame(
            [
                ["2012-08-02", "L", 10],
                ["2012-08-02", "J", 15],
                ["2013-04-06", "L", 20],
                ["2013-04-06", "J", 25],
            ],
            columns=["x", "y", "a"],
        )
        dfa["x"] = pd.to_datetime(dfa["x"])
        dfb = DataFrame(
            [["2012-08-02", "J", 1], ["2013-04-06", "L", 2]],
            columns=["x", "y", "z"],
            index=[2, 4],
        )
        dfb["x"] = pd.to_datetime(dfb["x"])
        result = dfb.join(dfa.set_index(["x", "y"]), on=["x", "y"])
        expected = DataFrame(
            [
                [Timestamp("2012-08-02 00:00:00"), "J", 1, 15],
                [Timestamp("2013-04-06 00:00:00"), "L", 2, 20],
            ],
            index=[2, 4],
            columns=["x", "y", "z", "a"],
        )
        tm.assert_frame_equal(result, expected)
# assumed imports for this snippet, inferred from the calls below
from pandas import read_csv, DataFrame
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

df = read_csv("Phoneix_Finalclean.csv")
aa = df.Conditions_Name.value_counts()
# value_counts() returns a Series, so the bar plot needs no x/y arguments
ax = aa.plot(kind='bar', color="blue", figsize=(15, 8), fontsize=16)
#plt.title("Twelve years' weather condition summary",size=30)
#ax.set_title("2004-2016 Phoenix weather condition summary",size=30)
ax.set_xlabel('Weather Condition',size=20) 
ax.set_ylabel('Total Amount/hour',size=20)                    
plt.show()


scaler = MinMaxScaler(feature_range=(0, 1))
scaled_d = df.loc[:, ["Sea_Level_PressureIn_N", "Humidity_N", "Dew_PointF_N", "Wind_Speed_mps", "Temperature_C_N"]]
scaled = scaler.fit_transform(scaled_d)
scaled = DataFrame(scaled)
scaled.columns = ["Sea_Level_PressureIn_N","Humidity_N","Dew_PointF_N","Wind_Speed_mps","Temperature_C_N"]
x = scaled.join(df.loc[:,["Hour","Conditions_Name"]])
x = x.loc[x['Conditions_Name'].isin(["Clear","Mostly Cloudy","Partly Cloudy","Scattered Clouds",'Overcast'])]
x = x.dropna()
print(x.isnull().sum())  # verify there are no missing values left after dropna
count = x.Conditions_Name.value_counts()
print(count)

def encode(data, col, max_val):
    data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)
    data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)
    return data
x = encode(x, 'Hour', 23)
x = x.drop(["Hour"], axis=1)
# plot.scatter returns the Axes; set the aspect on it afterwards
# (chaining .set_aspect() would leave ax as None)
ax = x.plot.scatter('Hour_sin', 'Hour_cos')
ax.set_aspect('equal')
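# A quick sanity check of the cyclical encoding above (a minimal sketch,
# reusing the encode() helper defined in this example): with max_val=23,
# hour 23 lands on (almost) the same point of the unit circle as hour 0,
# so 23:00 and midnight are treated as neighbours instead of 23 apart.
# Using max_val=24 would keep all 24 hours distinct while still adjacent.
demo = DataFrame({'Hour': [0, 23]})
demo = encode(demo, 'Hour', 23)
dist = np.hypot(demo['Hour_sin'].diff(), demo['Hour_cos'].diff()).iloc[-1]
print(round(float(dist), 3))  # ~0.0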
"""
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
Exemplo n.º 47
0
def train_and_eval_classifier(dataframe: pd.DataFrame,
                              label_name: str,
                              train_fraction: float,
                              model_name: str,
                              seed: int,
                              verbose: int = 1,
                              n_jobs: int = 4) -> Tuple[pd.DataFrame, Dict]:
    """
    Train and evaluate the classifier given the dataset as a dataframe.
    The dataset is a design matrix: each row holds one observation and
    each column denotes an explanatory variable. The best parameters
    are found by leave-one-out cross-validation, scored by accuracy.

    :param dataframe: Data collected for the classification problem.
    :param label_name: Name of the label i.e., the dependent variable.
    :param train_fraction: Fraction of samples for
        each class for stratified sampling.
    :param model_name: Name of the utilized model.
    :param seed: Seed used for reproduction of the experiment results.
    :param verbose: Verbosity mode.
    :param n_jobs: Number of jobs utilized for the parallel computing.
    :return: Tuple of the report over the test set as a dataframe
        and the best parameters found as a dictionary.
    """
    dataframe = dataframe.join(
        pd.get_dummies(dataframe[label_name], prefix='class'))
    class_names = [col_name for col_name in dataframe
                   if col_name.startswith('class')]

    train = dataframe.groupby(label_name, group_keys=False).apply(
        lambda class_group: class_group.sample(
            n=ceil(train_fraction * len(class_group)),
            random_state=seed)).drop(columns=label_name)

    test = dataframe.drop(train.index).drop(columns=label_name)

    X_train, y_train, X_test, y_test = \
        train.drop(columns=class_names), train[class_names], \
        test.drop(columns=class_names), test[class_names]

    model = GridSearchCV(
        estimator=ML_MODELS[model_name](random_state=seed),
        param_grid=ML_MODELS_GRID[model_name],
        cv=LeaveOneOut().split(X_train, y_train),
        scoring=make_scorer(accuracy_score),
        verbose=verbose,
        n_jobs=n_jobs,
        refit=True).fit(X_train, y_train)

    y_test_pred = model.predict(X_test)

    y_test_true_argmax = y_test.values.argmax(axis=1)
    y_test_pred_argmax = y_test_pred.argmax(axis=1)
    class_names = {class_name: i for i, class_name in enumerate(list(y_test))}

    test_report = pd.DataFrame(
        confusion_matrix(y_true=y_test_true_argmax,
                         y_pred=y_test_pred_argmax,
                         labels=list(class_names.values())),
        index=['true_' + class_name for class_name in class_names.keys()],
        columns=['pred_' + class_name for class_name in class_names.keys()])

    placeholder = [None for _ in range(len(class_names) - 1)]
    test_report['test_oa_acc'] = [accuracy_score(
        y_true=y_test_true_argmax, y_pred=y_test_pred_argmax)] + placeholder
    test_report['test_avg_acc'] = [balanced_accuracy_score(
        y_true=y_test_true_argmax, y_pred=y_test_pred_argmax)] + placeholder
    return test_report, model.best_params_
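# A hedged usage sketch: ML_MODELS / ML_MODELS_GRID are registries the
# function expects from its own module; the toy versions below and the
# frame `my_df` are hypothetical stand-ins, not the original definitions.
from sklearn.ensemble import RandomForestClassifier

ML_MODELS = {'rf': RandomForestClassifier}
ML_MODELS_GRID = {'rf': {'n_estimators': [50, 100]}}

report, best_params = train_and_eval_classifier(
    dataframe=my_df,       # must contain the label column plus features
    label_name='label',
    train_fraction=0.7,
    model_name='rf',
    seed=42,
)
print(report)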
Exemplo n.º 48
0
    def filtered_summaries(
        self,
        start_time,
        end_time,
        interval,
        filter_expression,
        summary_types,
        calculation_basis=None,
        filter_evaluation=None,
        filter_interval=None,
        time_type=None,
    ):
        """filtered_summaries

        Return one or more summary values for each interval within a time range

        Args:
            start_time (str): String containing the date, and possibly time,
                from which to retrieve the values. This is parsed, together
                with `end_time`, using
                :afsdk:`AF.Time.AFTimeRange <M_OSIsoft_AF_Time_AFTimeRange__ctor_1.htm>`.
            end_time (str): String containing the date, and possibly time,
                until which to retrieve values. This is parsed, together
                with `start_time`, using
                :afsdk:`AF.Time.AFTimeRange <M_OSIsoft_AF_Time_AFTimeRange__ctor_1.htm>`.
            interval (str): String containing the interval at which to extract
                data. This is parsed using
                :afsdk:`AF.Time.AFTimeSpan.Parse <M_OSIsoft_AF_Time_AFTimeSpan_Parse_1.htm>`.
            filter_expression (str, optional): Defaults to ''. Query on which
                data to include in the results. See :ref:`filtering_values`
                for more information on filter queries.
            summary_types (int or PIConsts.SummaryType): Type(s) of summaries
                of the data within the requested time range.
            calculation_basis (int or PIConsts.CalculationBasis, optional):
                Event weighting within an interval. See :ref:`event_weighting`
                and :any:`CalculationBasis` for more information. Defaults to
                CalculationBasis.TIME_WEIGHTED.
            filter_evaluation (int or PIConsts.ExpressionSampleType, optional):
                Determines whether the filter is applied to the raw events in
                the database, or if it is applied to an interpolated series
                with a regular interval. Defaults to
                ExpressionSampleType.EXPRESSION_RECORDED_VALUES.
            filter_interval (str, optional): String containing the interval
                at which to apply the filter. This is parsed using
                :afsdk:`AF.Time.AFTimeSpan.Parse <M_OSIsoft_AF_Time_AFTimeSpan_Parse_1.htm>`.
            time_type (int or PIConsts.TimestampCalculation, optional):
                Timestamp to return for each of the requested summaries. See
                :ref:`summary_timestamps` and :any:`TimestampCalculation` for
                more information. Defaults to TimestampCalculation.AUTO.

        Returns:
            pandas.DataFrame: Dataframe with the unique timestamps as row index
                and the summary name as column name.
        """
        time_range = AF.Time.AFTimeRange(start_time, end_time)
        interval = AF.Time.AFTimeSpan.Parse(interval)
        filter_expression = self._normalize_filter_expression(
            filter_expression)
        calculation_basis = get_enumerated_value(
            enumeration=CalculationBasis,
            value=calculation_basis,
            default=CalculationBasis.TIME_WEIGHTED,
        )
        filter_evaluation = get_enumerated_value(
            enumeration=ExpressionSampleType,
            value=filter_evaluation,
            default=ExpressionSampleType.EXPRESSION_RECORDED_VALUES,
        )
        time_type = get_enumerated_value(
            enumeration=TimestampCalculation,
            value=time_type,
            default=TimestampCalculation.AUTO,
        )
        filter_interval = AF.Time.AFTimeSpan.Parse(filter_interval)
        pivalues = self._filtered_summaries(
            time_range,
            interval,
            filter_expression,
            summary_types,
            calculation_basis,
            filter_evaluation,
            filter_interval,
            time_type,
        )
        df = DataFrame()
        for summary in pivalues:
            key = SummaryType(summary.Key).name
            timestamps, values = zip(
                *[(PISeries.timestamp_to_index(value.Timestamp.UtcTime),
                   value.Value) for value in summary.Value])
            df = df.join(DataFrame(data={key: values}, index=timestamps),
                         how="outer")
        return df
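# A minimal usage sketch, assuming `point` is a PIconnect tag object
# (e.g. a PIPoint) exposing this method, and using PI time-string syntax.
from PIconnect.PIConsts import SummaryType

summaries = point.filtered_summaries(
    start_time='*-1d',
    end_time='*',
    interval='1h',
    filter_expression="'sinusoid' > 30",
    summary_types=SummaryType.AVERAGE | SummaryType.MAXIMUM,
)
print(summaries.head())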
Exemplo n.º 49
0
# keep only rows whose 时间 (timestamp) values occur in both dfs1[0] and dfs1[2]
i = [t in dfs1[2].时间.values for t in dfs1[0].时间.values]
dfs1[0] = dfs1[0][i]
dfs1[1] = dfs1[1][i]

i = [t in dfs1[0].时间.values for t in dfs1[2].时间.values]
for j in range(2, 15):
    dfs1[j] = dfs1[j][i]

for j in range(len(dfs1)):
    dfs1[j] = dfs1[j].set_index("时间")

#dfs1[0] = dfs1[0][t in dfs1[2].时间.values for t in dfs1[0].时间.values]
tr_data = dfs1[0].iloc[:, 0].apply(float)
tr_data = DataFrame(tr_data)
for i in range(1, len(dfs1)):
    tr_data = tr_data.join(dfs1[i].iloc[:, 0].apply(float))

tr_data1 = dfs1[0].iloc[:, 1].apply(float)
tr_data1 = DataFrame(tr_data1)
for i in range(1, len(dfs1)):
    tr_data1 = tr_data1.join(dfs1[i].iloc[:, 1].apply(float))

tr_data2 = dfs1[0].iloc[:, 2].apply(float)
tr_data2 = DataFrame(tr_data2)
for i in range(1, len(dfs1)):
    tr_data2 = tr_data2.join(dfs1[i].iloc[:, 2].apply(float))

# correlation heat map of the third feature set ("plot" is matplotlib.pyplot)
corMat = DataFrame(tr_data2.corr())
plot.pcolor(corMat)
plot.show()
Exemplo n.º 50
0
    def get_forward_data(self, months, call=True, put=False):
        """
        Gets either call, put, or both data for months starting in the current
        month and going out in the future a specified amount of time.

        Parameters
        ----------
        months: number, int
            How many months to go out in the collection of the data. This is
            inclusive.

        call: bool, optional (default=True)
            Whether or not to collect data for call options

        put: bool, optional (default=False)
            Whether or not to collect data for put options.

        Returns
        -------
        all_calls: DataFrame
            If asked for, a DataFrame containing call data from the current
            month to the current month plus months.

        all_puts: DataFrame
            If asked for, a DataFrame containing put data from the current
            month to the current month plus months.
        """
        # cur_month / cur_year come from the surrounding module in the original;
        # a list is required because the loop below mutates in_months in place
        in_months = list(range(cur_month, cur_month + months + 1))
        in_years = [cur_year] * months

        # Figure out how many items in in_months go past 12
        to_change = 0
        for i in range(months):
            if in_months[i] > 12:
                in_months[i] -= 12
                to_change += 1

        # Change the corresponding items in the in_years list.
        for i in range(1, to_change + 1):
            in_years[-i] += 1

        if call:
            all_calls = DataFrame()
            for mon in range(months):
                try:  # This catches cases when there isn't data for a month
                    call_frame = self.get_call_data(in_months[mon],
                                                    in_years[mon])
                    tick = str(call_frame.iloc[0, 1])
                    start = len(self.symbol)
                    year = tick[start:start + 2]
                    month = tick[start + 2:start + 4]
                    day = tick[start + 4:start + 6]
                    expiry = str(month + '-' + day + '-' + year)
                    call_frame['Expiry'] = expiry
                    if mon == 0:
                        all_calls = all_calls.join(call_frame, how='right')
                    else:
                        all_calls = concat([all_calls, call_frame])
                except Exception:
                    pass

        if put:
            all_puts = DataFrame()
            for mon in range(months):
                try:  # This catches cases when there isn't data for a month
                    put_frame = self.get_put_data(in_months[mon],
                                                  in_years[mon])

                    # Add column with expiry data to this frame.
                    tick = str(put_frame.iloc[0, 1])
                    start = len(self.symbol)
                    year = tick[start:start + 2]
                    month = tick[start + 2:start + 4]
                    day = tick[start + 4:start + 6]
                    expiry = str(month + '-' + day + '-' + year)
                    put_frame['Expiry'] = expiry

                    if mon == 0:
                        all_puts = all_puts.join(put_frame, how='right')
                    else:
                        all_puts = concat([all_puts, put_frame])
                except Exception:
                    pass

        if call and put:
            return [all_calls, all_puts]
        else:
            if call:
                return all_calls
            else:
                return all_puts
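# A hedged usage sketch, assuming the era-appropriate options wrapper this
# method belonged to (pandas.io.data.Options, later moved out of pandas);
# 'AAPL' is just an illustrative ticker.
from pandas.io.data import Options

aapl = Options('AAPL')
calls, puts = aapl.get_forward_data(3, call=True, put=True)
print(calls.head())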
Exemplo n.º 51
0
    def _read_one_data(self, url, params):
        """ read one data from specified symbol """

        symbol = params["symbol"]
        del params["symbol"]
        url = url.format(symbol)

        resp = self._get_response(url, params=params)
        ptrn = r"root\.App\.main = (.*?);\n}\(this\)\);"
        try:
            j = json.loads(re.search(ptrn, resp.text, re.DOTALL).group(1))
            data = j["context"]["dispatcher"]["stores"]["HistoricalPriceStore"]
        except KeyError:
            msg = "No data fetched for symbol {} using {}"
            raise RemoteDataError(msg.format(symbol, self.__class__.__name__))

        # price data
        prices = DataFrame(data["prices"])
        prices.columns = [col.capitalize() for col in prices.columns]
        prices["Date"] = to_datetime(
            to_datetime(prices["Date"], unit="s").dt.date)

        if "Data" in prices.columns:
            prices = prices[prices["Data"].isnull()]
        prices = prices[[
            "Date", "High", "Low", "Open", "Close", "Volume", "Adjclose"
        ]]
        prices = prices.rename(columns={"Adjclose": "Adj Close"})

        prices = prices.set_index("Date")
        prices = prices.sort_index().dropna(how="all")

        if self.ret_index:
            prices["Ret_Index"] = _calc_return_index(prices["Adj Close"])
        if self.adjust_price:
            prices = _adjust_prices(prices)

        # dividends & splits data
        if self.get_actions and data["eventsData"]:

            actions = DataFrame(data["eventsData"])
            actions.columns = [col.capitalize() for col in actions.columns]
            actions["Date"] = to_datetime(
                to_datetime(actions["Date"], unit="s").dt.date)

            types = actions["Type"].unique()
            if "DIVIDEND" in types:
                divs = actions[actions.Type == "DIVIDEND"].copy()
                divs = divs[["Date", "Amount"]].reset_index(drop=True)
                divs = divs.set_index("Date")
                divs = divs.rename(columns={"Amount": "Dividends"})
                prices = prices.join(divs, how="outer")

            if "SPLIT" in types:

                def split_ratio(row):
                    if float(row["Numerator"]) > 0:
                        if ":" in row["Splitratio"]:
                            n, m = row["Splitratio"].split(':')
                            return float(m) / float(n)
                        else:
                            return eval(row["Splitratio"])
                    else:
                        return 1

                splits = actions[actions.Type == "SPLIT"].copy()
                splits["SplitRatio"] = splits.apply(split_ratio, axis=1)
                splits = splits.reset_index(drop=True)
                splits = splits.set_index("Date")
                splits["Splits"] = splits["SplitRatio"]
                prices = prices.join(splits["Splits"], how="outer")

                if "DIVIDEND" in types and not self.adjust_dividends:
                    # dividends are adjusted automatically by Yahoo
                    adj = (prices["Splits"].sort_index(
                        ascending=False).fillna(1).cumprod())
                    prices["Dividends"] = prices["Dividends"] / adj

        return prices
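Exemplo n.º 52
0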
    def data_encoding(self, raw_data: pd.DataFrame, building_num: int,
                      gates_code_table: dict) -> np.array:
        """
        Encode raw record data from database

        :param gates_code_table: mapping from gate identifiers to integer codes
        :param building_num: total number of buildings
        :param raw_data: raw data from DataTable.get_raw_record_data()

        :return:
            - data_list : encoded feature matrix
            - target_list : gate labels for the encoded rows
        """
        week_data = raw_data['datetime'].dt.weekday.rename('week')
        raw_data = raw_data.join(week_data)
        raw_data = raw_data.reset_index().drop(columns=['index'])
        data_list = pd.DataFrame()
        #############################
        # Feature Encoding          #
        #############################
        # gate one hot encoding
        gate_one_hot_list = np.arange(len(gates_code_table)).reshape(-1, 1)
        gate_encoder = OneHotEncoder()
        gate_encoder.fit(gate_one_hot_list)

        week_one_hot_list = np.arange(7).reshape(-1, 1)
        week_encoder = OneHotEncoder()
        week_encoder.fit(week_one_hot_list)

        building_one_hot_list = np.arange(1, building_num + 1).reshape(-1, 1)
        building_encoder = OneHotEncoder()
        building_encoder.fit(building_one_hot_list)

        gatecode = raw_data['building'].str.cat([raw_data['floor'], raw_data['IO']], sep='-').apply(
            lambda x: gates_code_table[x] if x in gates_code_table else 0).rename('gate').astype(int)
        raw_data['gate'] = gatecode
        raw_data['next_gate'] = gatecode.shift(-1)

        gatecode = raw_data['gate']
        gatecode_onehotcode = gate_encoder.transform(gatecode.values.reshape(-1, 1)).toarray()
        gatecode_onehotcode = pd.DataFrame(gatecode_onehotcode, dtype='int').add_prefix('gate_')

        # weekday one hot encoding
        weekdaycode = week_encoder.transform(raw_data['week'].values.reshape(-1, 1)).toarray()
        weekdaycode = pd.DataFrame(weekdaycode, dtype='int').add_prefix('weekday_')

        # building one hot encoding
        buildingcode = raw_data['building'].astype(int)
        buildingcode_onehotcode = building_encoder.transform(buildingcode.values.reshape(-1, 1)).toarray()
        buildingcode_onehotcode = pd.DataFrame(buildingcode_onehotcode, dtype='int').add_prefix('building_')

        # Time feature
        data_list['hour'] = raw_data['datetime'].apply(lambda x: x.hour / 24)
        data_list['minute'] = raw_data['datetime'].apply(lambda x: x.minute / 60)
        data_list['second'] = raw_data['datetime'].apply(lambda x: x.second / 60)

        # IO code
        IOcode = raw_data['IO'].apply(lambda x: convert_IOcode(x))
        # join feature
        data_list = data_list.join(other=[IOcode, weekdaycode, buildingcode_onehotcode, gatecode_onehotcode])
        # match order
        data_list = data_list.dropna(how='any')
        target_list = raw_data['next_gate']

        data_list = data_list.values
        target_list = target_list.values.flatten()
        return data_list, target_list
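# A hedged usage sketch for data_encoding; `table`, `raw_df` and `codes`
# are hypothetical stand-ins (a DataTable-like instance, the output of its
# get_raw_record_data(), and a {"building-floor-IO": code} mapping).
X, y = table.data_encoding(raw_df, building_num=3, gates_code_table=codes)
print(X.shape, y.shape)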
Exemplo n.º 53
0
 def test_join_empty_bug(self):
     # generated an exception in 0.4.3
     x = DataFrame()
     x.join(DataFrame([3], index=[0], columns=['A']), how='outer')
Exemplo n.º 54
0
 def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
     df_encoded = self.transform(df)
     return df.join(df_encoded)
Exemplo n.º 55
0
def align_srt_tshark_stats(stats: pd.DataFrame, rcv_tshark_csv: str):
    """
    Align SRT statistics and tshark data.

    Args:
        stats:
            Aligned SRT statistics collected both at the receiver
            and sender sides, the output from align_srt_stats function.
        rcv_tshark_csv:
            Filepath to .csv tshark data collected at the receiver side.
    """
    print('\nMerging tshark data with SRT statistics')

    # Extract SRT packets from .csv tshark dump file collected at the receiver side
    srt_packets = extract_srt_packets(rcv_tshark_csv)

    print('\nSRT packets extracted from receiver tshark dump')
    print(srt_packets.head(10))

    # Extract UMSG_ACK packets from SRT packets srt_packets that
    # contain receiving speed and bandwidth estimations reported by
    # receiver each 10 ms
    umsg_ack_packets = extract_umsg_ack_packets(srt_packets)

    print('\nUMSG_ACK packets extracted from SRT packets')
    print(umsg_ack_packets.head(10))

    # From umsg_ack_packets dataframe, extract features valuable 
    # for further analysis, do some data cleaning and timezone correction
    TSHARK_FEATURES = [
        'ws.no',
        'frame.time',
        'srt.rtt',
        'srt.rttvar',
        'srt.rate',
        'srt.bw',
        'srt.rcvrate'
    ]
    umsg_ack_packets = umsg_ack_packets[TSHARK_FEATURES]
    umsg_ack_packets = umsg_ack_packets.set_index('frame.time')
    umsg_ack_packets.index = umsg_ack_packets.index.tz_convert(None)
    umsg_ack_packets['srt.rtt'] = umsg_ack_packets['srt.rtt'] / 1000
    umsg_ack_packets['srt.rttvar'] = umsg_ack_packets['srt.rttvar'] / 1000
    umsg_ack_packets = umsg_ack_packets.rename(
        columns={
            'srt.rtt': 'srt.rtt.ms',
            'srt.rttvar': 'srt.rttvar.ms',
            'srt.rate': 'srt.rate.pkts',
            'srt.bw': 'srt.bw.pkts',
            'srt.rcvrate': 'srt.rate.Bps'
        }
    )
    umsg_ack_packets['srt.rate.Mbps'] = convert_bytesps_in_mbps(
        umsg_ack_packets['srt.rate.Bps']
    )
    umsg_ack_packets['srt.bw.Mbps'] = convert_bytesps_in_mbps(
        convert_pktsps_in_bytesps(umsg_ack_packets['srt.bw.pkts'])
    )
    umsg_ack_packets = umsg_ack_packets[
        [
            'ws.no',
            'srt.rtt.ms',
            'srt.rttvar.ms',
            'srt.rate.pkts',
            'srt.rate.Mbps',
            'srt.bw.pkts',
            'srt.bw.Mbps'
        ]
    ]

    print('\nAdjusted UMSG_ACK packets')
    print(umsg_ack_packets.head(10))
    print(umsg_ack_packets.tail(10))

    # Combine stats dataframe (with SRT statistics) and adjusted 
    # umsg_ack_packets dataframe. stats dataframe timepoints will be
    # further used as the timepoints for result dataframe
    start_timestamp = stats.index[0]
    end_timestamp = stats.index[-1]
    
    stats['isStats'] = True
    cols = ['srt.rtt.ms', 'srt.rttvar.ms', 'srt.rate.Mbps', 'srt.bw.Mbps']
    df = stats.join(umsg_ack_packets[cols].add_suffix('_tshark'), how='outer')
    df['isStats'] = df['isStats'].fillna(False)

    df = df[(df.index >= start_timestamp) & (df.index <= end_timestamp)]
    assert(df['isStats'][0] == True)
    assert(df['isStats'][-1] == True)

    print('\nJoined SRT stats and tshark statistics')
    print(df.head(10))
    print(df.tail(10))

    # Do interpolation
    cols_to_interpolate = [f'{col}_tshark' for col in cols]
    df.loc[:, cols_to_interpolate] = df.interpolate().fillna(method='bfill')
    df.loc[:, cols_to_interpolate] = df.round(2)

    print('\nInterpolated tshark statistics')
    print(df.head(10))
    print(df.tail(10))

    # Extract only stats dataframe timepoints (aligned SRT stats timepoints)
    df = df.loc[df['isStats'], df.columns != 'isStats']

    cols_to_int = [
        'pktSent_snd',
        'pktSndLoss_snd',
        'pktRecv_rcv',
        'pktRcvLoss_rcv',
    ]
    # TODO: Does not work
    # df.loc[:, cols_to_int] = df.astype('int32')
    for col in cols_to_int:
        df[col] = df[col].astype('int32')

    print('\nOnly SRT stats timepoints')
    print(df.head(10))
    print(df.tail(10))

    # Rearrange the columns
    cols_rearranged = [
        'pktSent_snd',
        'pktRecv_rcv',
        'pktSndLoss_snd',
        'pktRcvLoss_rcv',
        'msRTT_snd',
        'msRTT_rcv',
        'srt.rtt.ms_tshark',
        'srt.rttvar.ms_tshark',
        'mbpsBandwidth_snd',
        'mbpsBandwidth_rcv',
        'srt.bw.Mbps_tshark',
        # 'srt.rate.Mbps_tshark'
    ]
    df = df[cols_rearranged]

    return df
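# A hedged call sketch; `aligned_stats` is assumed to be the output of the
# align_srt_stats step referenced in the docstring, and the .csv path is
# illustrative.
df = align_srt_tshark_stats(aligned_stats, 'rcv_tshark.csv')
df.to_csv('aligned_srt_tshark.csv')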
Exemplo n.º 56
0
 def transform(self, df: pd.DataFrame) -> pd.DataFrame:
     df_encoded = self._internal_encoder.transform(df[self.name])
     df_encoded = df_encoded.drop(columns=['intercept'], errors='ignore')
     df_encoded = self.update_column_names(df_encoded)
     return df.join(df_encoded)
Exemplo n.º 57
0
    def test_left_join_index_multi_match_multiindex(self):
        left = DataFrame(
            [
                ["X", "Y", "C", "a"],
                ["W", "Y", "C", "e"],
                ["V", "Q", "A", "h"],
                ["V", "R", "D", "i"],
                ["X", "Y", "D", "b"],
                ["X", "Y", "A", "c"],
                ["W", "Q", "B", "f"],
                ["W", "R", "C", "g"],
                ["V", "Y", "C", "j"],
                ["X", "Y", "B", "d"],
            ],
            columns=["cola", "colb", "colc", "tag"],
            index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8],
        )

        right = DataFrame(
            [
                ["W", "R", "C", 0],
                ["W", "Q", "B", 3],
                ["W", "Q", "B", 8],
                ["X", "Y", "A", 1],
                ["X", "Y", "A", 4],
                ["X", "Y", "B", 5],
                ["X", "Y", "C", 6],
                ["X", "Y", "C", 9],
                ["X", "Q", "C", -6],
                ["X", "R", "C", -9],
                ["V", "Y", "C", 7],
                ["V", "R", "D", 2],
                ["V", "R", "D", -1],
                ["V", "Q", "A", -3],
            ],
            columns=["col1", "col2", "col3", "val"],
        ).set_index(["col1", "col2", "col3"])

        result = left.join(right, on=["cola", "colb", "colc"], how="left")

        expected = DataFrame(
            [
                ["X", "Y", "C", "a", 6],
                ["X", "Y", "C", "a", 9],
                ["W", "Y", "C", "e", np.nan],
                ["V", "Q", "A", "h", -3],
                ["V", "R", "D", "i", 2],
                ["V", "R", "D", "i", -1],
                ["X", "Y", "D", "b", np.nan],
                ["X", "Y", "A", "c", 1],
                ["X", "Y", "A", "c", 4],
                ["W", "Q", "B", "f", 3],
                ["W", "Q", "B", "f", 8],
                ["W", "R", "C", "g", 0],
                ["V", "Y", "C", "j", 7],
                ["X", "Y", "B", "d", 5],
            ],
            columns=["cola", "colb", "colc", "tag", "val"],
            index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8],
        )

        tm.assert_frame_equal(result, expected)

        result = left.join(right,
                           on=["cola", "colb", "colc"],
                           how="left",
                           sort=True)

        expected = expected.sort_values(["cola", "colb", "colc"],
                                        kind="mergesort")

        tm.assert_frame_equal(result, expected)
Exemplo n.º 58
0
    def test_join_multi_levels(self):

        # GH 3662
        # merge multi-levels
        household = DataFrame(
            dict(
                household_id=[1, 2, 3],
                male=[0, 1, 0],
                wealth=[196087.3, 316478.7, 294750],
            ),
            columns=["household_id", "male", "wealth"],
        ).set_index("household_id")
        portfolio = DataFrame(
            dict(
                household_id=[1, 2, 2, 3, 3, 3, 4],
                asset_id=[
                    "nl0000301109",
                    "nl0000289783",
                    "gb00b03mlx29",
                    "gb00b03mlx29",
                    "lu0197800237",
                    "nl0000289965",
                    np.nan,
                ],
                name=[
                    "ABN Amro",
                    "Robeco",
                    "Royal Dutch Shell",
                    "Royal Dutch Shell",
                    "AAB Eastern Europe Equity Fund",
                    "Postbank BioTech Fonds",
                    np.nan,
                ],
                share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0],
            ),
            columns=["household_id", "asset_id", "name", "share"],
        ).set_index(["household_id", "asset_id"])
        result = household.join(portfolio, how="inner")
        expected = (DataFrame(
            dict(
                male=[0, 1, 1, 0, 0, 0],
                wealth=[
                    196087.3, 316478.7, 316478.7, 294750.0, 294750.0, 294750.0
                ],
                name=[
                    "ABN Amro",
                    "Robeco",
                    "Royal Dutch Shell",
                    "Royal Dutch Shell",
                    "AAB Eastern Europe Equity Fund",
                    "Postbank BioTech Fonds",
                ],
                share=[1.00, 0.40, 0.60, 0.15, 0.60, 0.25],
                household_id=[1, 2, 2, 3, 3, 3],
                asset_id=[
                    "nl0000301109",
                    "nl0000289783",
                    "gb00b03mlx29",
                    "gb00b03mlx29",
                    "lu0197800237",
                    "nl0000289965",
                ],
            )).set_index([
                "household_id", "asset_id"
            ]).reindex(columns=["male", "wealth", "name", "share"]))
        tm.assert_frame_equal(result, expected)

        # equivalency
        result = merge(
            household.reset_index(),
            portfolio.reset_index(),
            on=["household_id"],
            how="inner",
        ).set_index(["household_id", "asset_id"])
        tm.assert_frame_equal(result, expected)

        result = household.join(portfolio, how="outer")
        expected = concat(
            [
                expected,
                (DataFrame(
                    dict(share=[1.00]),
                    index=MultiIndex.from_tuples(
                        [(4, np.nan)], names=["household_id", "asset_id"]),
                )),
            ],
            axis=0,
            sort=True,
        ).reindex(columns=expected.columns)
        tm.assert_frame_equal(result, expected)

        # invalid cases
        household.index.name = "foo"

        with pytest.raises(
                ValueError,
                match="cannot join with no overlapping index names"):
            household.join(portfolio, how="inner")

        portfolio2 = portfolio.copy()
        portfolio2.index = portfolio2.index.set_names(["household_id", "foo"])

        with pytest.raises(ValueError,
                           match="columns overlap but no suffix specified"):
            portfolio2.join(portfolio, how="inner")
Exemplo n.º 59
0
def add_dummies(data: pd.DataFrame, column: str):
    ohe = pd.get_dummies(data[column]).add_prefix(f'{column}_')
    data = data.drop(column, axis=1)
    data = data.join(ohe)
    return data
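# A quick usage sketch for add_dummies; the toy frame and column name are
# made up for illustration.
import pandas as pd

pets = pd.DataFrame({'animal': ['cat', 'dog', 'cat'], 'age': [2, 5, 1]})
encoded = add_dummies(pets, 'animal')
print(encoded.columns.tolist())  # ['age', 'animal_cat', 'animal_dog']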
Exemplo n.º 60
0
 def insert_timeseries(self, df: pd.DataFrame, columns: list, timeseries: dict, interpolate=None, plot=False, title=None, columns_i: list=None, minType=None):
     if not interpolate:
         interpolate = self.INTERPOLATE
     i_date = copy.copy(self.START_DATE)
     td = timedelta(hours=1) if self.TIMESTEP == "hourly" else timedelta(days=1)  # hourly or daily
     data = []
     c = len(columns_i) if columns_i else len(columns)
     missing_data = [np.nan for i in range(0, c)]
     while i_date <= self.END_DATE:
         values = []
         datestamp = i_date.strftime("%Y-%m-%d %H")
         if datestamp in timeseries.keys():
             if columns_i:
                 for i in columns_i:
                     if minType:
                         v = datetime.strptime(timeseries[datestamp][i], minType)
                     else:
                         v = float(timeseries[datestamp][i])
                     if int(v) == -9998 or int(v) == -9999:
                         values.append(np.nan)
                     else:
                         values.append(v)
             else:
                 for v in timeseries[datestamp]:
                     v = float(v)
                     if int(v) == -9998 or int(v) == -9999:
                         values.append(np.nan)
                     else:
                         values.append(v)
         else:
             values = missing_data
         data.append(values)
         i_date = i_date + td
     for i in range(0, len(data)):
         data[i] = np.asarray(data[i], dtype=np.float64)
     temp_data = np.asarray(data)  # 2-D copy of the raw values, used for plotting below
     data_df = pd.DataFrame(data, columns=columns, dtype=np.float64)
     merge = True
     for c in columns:
         if interpolate in ["linear", "slinear", "quadratic", "cubic", "values"]:
             data_df[c] = data_df[c].interpolate(method=interpolate).ffill().bfill()
         elif interpolate in ["polynomial", "spline"]:
             data_df[c] = data_df[c].interpolate(method=interpolate, order=4).ffill().bfill()
         elif interpolate == "gaussian":
             merge = False
             df = df.join(data_df, how='outer')
             df = self.random_gaussian(df, columns)
         else:
             data_df[c] = data_df[c].fillna(method=interpolate).ffill().bfill()
     if merge:
         df = df.join(data_df, how='outer')
     if plot:
         plot_data = pd.DataFrame()
         plot_columns = list(columns)  # copy so the appends below do not mutate columns
         for i in range(0, len(columns)):
             c = columns[i]
             c0 = c + "_0"
             d_i = df[c]
             plot_data[c] = temp_data[:, i]
             plot_data[c0] = d_i
             plot_columns.append(c0)
         x = pd.to_datetime(df[["year", "month", "day", "hour"]])
         plot_data["datetime"] = x
         colors = ['b', 'm', 'g', 'c', 'y', 'k']
         ax = plot_data.plot(x='datetime', y=plot_columns[0], linewidth=1.0, label=plot_columns[0], color=colors[0], figsize=(16, 8))
         for c in range(1, len(plot_columns)):
             # draw on the same axes; cycle the colour list for extra columns
             plot_data.plot(x='datetime', y=plot_columns[c], linewidth=0.5, label=plot_columns[c], color=colors[c % len(colors)], ax=ax)
         ax.set_title("{} - {} interpolation".format(title, interpolate))
         plt.show()
     return df