def test_select_on_unambiguous_asof_join(func, npartitions):
    """Compare a selection over an as-of join with ``dd.merge_asof``.

    The two input tables share no column names, so the projection onto
    ``a0`` and ``a1`` is unambiguous.  ``func`` is applied to the join
    expression and ``npartitions`` controls how the dask frames are split;
    both are supplied from outside this function (presumably by pytest
    parametrization -- confirm against the decorators in the full file).
    """

    def materialize(frame):
        # Single-threaded scheduler, exactly as each compute call requires.
        return frame.compute(scheduler='single-threaded')

    left_pdf = pd.DataFrame(
        {'a0': [1, 2, 3], 'b1': date_range("20180101", periods=3)}
    )
    df_t = dd.from_pandas(left_pdf, npartitions=npartitions)

    right_pdf = pd.DataFrame(
        {'a1': [2, 3, 4], 'b2': date_range("20171230", periods=3)}
    )
    df_s = dd.from_pandas(right_pdf, npartitions=npartitions)

    con = ibis.dask.connect({"t": df_t, "s": df_s})
    t = con.table("t")
    s = con.table("s")
    join = t.asof_join(s, t.b1 == s.b2)

    # Reference result built directly with dask, keeping only a0 and a1.
    merged = dd.merge_asof(df_t, df_s, left_on=["b1"], right_on=["b2"])
    expected = merged[["a0", "a1"]]
    # Guard against a vacuous comparison of two empty frames.
    assert not materialize(expected).empty

    result = func(join).compile()
    tm.assert_frame_equal(materialize(result), materialize(expected))
def test_context_adjustment_asof_join(
    time_keyed_left, time_keyed_right, time_keyed_df1, time_keyed_df2
):
    """Execute a keyed as-of join under a time context and check the result.

    The join uses a four-day tolerance.  The expected frame is produced by
    ``dd.merge_asof`` over manually trimmed inputs: the left frame is cut to
    the context window, while the right frame's lower bound is moved four
    days earlier than the context begin.
    """
    joined = time_keyed_left.asof_join(
        time_keyed_right,
        'time',
        by='key',
        tolerance=4 * ibis.interval(days=1),
    )
    expr = joined[time_keyed_left, time_keyed_right.other_value]

    context = (Timestamp('20170105'), Timestamp('20170111'))
    result = expr.execute(timecontext=context)

    # Compare with asof_join of manually trimmed tables.
    begin, end = context

    # Left table: rows inside [begin, end).  The second mask is built from
    # the untrimmed frame, mirroring the chained indexing being tested.
    left_from_begin = time_keyed_df1[time_keyed_df1['time'] >= begin]
    trimmed_df1 = left_from_begin[time_keyed_df1['time'] < end]

    # Right table: lower bound widened by the four-day tolerance.
    right_from_begin = time_keyed_df2[
        time_keyed_df2['time'] >= begin - Timedelta(days=4)
    ]
    trimmed_df2 = right_from_begin[time_keyed_df2['time'] < end]

    expected_lazy = dd.merge_asof(
        trimmed_df1,
        trimmed_df2,
        on='time',
        by='key',
        tolerance=Timedelta('4D'),
    )
    expected = expected_lazy.compute()
    tm.assert_frame_equal(result, expected)
def execute_asof_join(op, left, right, by, tolerance, predicates, **kwargs):
    """Run an as-of join of ``left`` and ``right`` via ``dd.merge_asof``.

    Join-key names are extracted from ``predicates`` and grouping-key names
    from ``by``; both are validated against the columns the two inputs have
    in common.  At most one join key per side is accepted.  ``op`` and
    ``kwargs`` are not used by this body.

    Returns whatever ``dd.merge_asof`` returns for the resolved arguments.
    """
    overlapping_columns = frozenset(left.columns).intersection(right.columns)
    left_on, right_on = _extract_predicate_names(predicates)
    left_by, right_by = _extract_predicate_names(by)
    _validate_columns(overlapping_columns, left_on, right_on, left_by, right_by)

    # Lengths are never negative, so "<= 1" is the whole constraint.
    for label, names in (("left_on", left_on), ("right_on", right_on)):
        assert len(names) <= 1, f"len({label}) == {len(names)}"

    # NB: dask 2022.4.1 contains a bug from
    # https://github.com/dask/dask/pull/8857 that keeps a column if `on` is
    # non-empty without checking whether `left_on` is non-empty.  To work
    # around it, pass either `on` alone (when both sides use the same key)
    # or the `left_on`/`right_on` pair -- never both.
    if left_on == right_on:
        key_kwargs = {"on": left_on, "left_on": None, "right_on": None}
    else:
        key_kwargs = {"on": None, "left_on": left_on, "right_on": right_on}

    return dd.merge_asof(
        left=left,
        right=right,
        left_by=left_by or None,
        right_by=right_by or None,
        tolerance=tolerance,
        **key_kwargs,
    )
def test_asof_join(time_left, time_right, time_df1, time_df2):
    """Check a plain as-of join on ``time`` against ``dd.merge_asof``.

    The compiled result is restricted to the expected frame's columns
    before the two are compared.
    """
    joined = time_left.asof_join(time_right, 'time')
    projected = joined[time_left, time_right.other_value]
    result = projected.compile()

    expected = dd.merge_asof(time_df1, time_df2, on='time')

    actual_frame = result[expected.columns].compute(scheduler='single-threaded')
    expected_frame = expected.compute(scheduler='single-threaded')
    tm.assert_frame_equal(actual_frame, expected_frame)
def test_adjust_context_complete_shift(
    time_keyed_left,
    time_keyed_right,
    time_keyed_df1,
    time_keyed_df2,
):
    """Exercise an `adjust_context` handler that shifts the whole context.

    The adjusted context produced here is NOT contained in the original
    one, in contrast to a handler that merely widens the context.

    See #3104
    """

    # Contrived handler registered for CustomAsOfJoin so that the shifted
    # context can be observed through the join result.
    @adjust_context.register(CustomAsOfJoin)
    def adjust_context_custom_asof_join(
        op: ops.AsOfJoin,
        timecontext: TimeContext,
        scope: Optional[Scope] = None,
    ) -> TimeContext:
        """Move begin and end back by the operation's tolerance."""
        begin, end = timecontext
        shift = execute(op.tolerance)
        return (begin - shift, end - shift)

    custom_join = CustomAsOfJoin(
        left=time_keyed_left,
        right=time_keyed_right,
        predicates='time',
        by='key',
        tolerance=ibis.interval(days=4),
    )
    expr = custom_join.to_expr()[time_keyed_left, time_keyed_right.other_value]

    context = (Timestamp('20170101'), Timestamp('20170111'))
    result = expr.execute(timecontext=context)

    # Build the reference from manually trimmed tables.
    window_begin, window_end = context
    four_days = Timedelta(days=4)

    # Left table: the context is applied unshifted.
    left_from_begin = time_keyed_df1[time_keyed_df1['time'] >= window_begin]
    trimmed_df1 = left_from_begin[time_keyed_df1['time'] < window_end]

    # Right table: both ends of the context are moved back by 4 days.
    right_from_begin = time_keyed_df2[
        time_keyed_df2['time'] >= window_begin - four_days
    ]
    trimmed_df2 = right_from_begin[
        time_keyed_df2['time'] < window_end - four_days
    ]

    expected = dd.merge_asof(
        trimmed_df1,
        trimmed_df2,
        on='time',
        by='key',
        tolerance=Timedelta('4D'),
    ).compute()
    tm.assert_frame_equal(result, expected)
def execute_asof_join(op, left, right, tolerance, **kwargs):
    """Run an as-of join of ``left`` and ``right`` via ``dd.merge_asof``.

    Join-key names come from ``op.predicates`` and grouping-key names from
    ``op.by``; they are validated against the columns shared by both inputs
    before merging.  ``kwargs`` is not used by this body.
    """
    shared_columns = frozenset(left.columns).intersection(right.columns)
    left_on, right_on = _extract_predicate_names(op.predicates)
    left_by, right_by = _extract_predicate_names(op.by)
    _validate_columns(shared_columns, left_on, right_on, left_by, right_by)

    merge_arguments = {
        "left": left,
        "right": right,
        "left_on": left_on,
        "right_on": right_on,
        # Falsy grouping keys are passed to dask as None.
        "left_by": left_by or None,
        "right_by": right_by or None,
        "tolerance": tolerance,
    }
    return dd.merge_asof(**merge_arguments)
def test_keyed_asof_join_with_tolerance(
    time_keyed_left, time_keyed_right, time_keyed_df1, time_keyed_df2
):
    """Check a keyed as-of join with a two-day tolerance.

    The compiled expression, restricted to the expected frame's columns,
    must equal ``dd.merge_asof`` called with ``by='key'`` and a tolerance
    of ``Timedelta('2D')``.
    """
    two_days = 2 * ibis.interval(days=1)
    joined = time_keyed_left.asof_join(
        time_keyed_right, 'time', by='key', tolerance=two_days
    )
    result = joined[time_keyed_left, time_keyed_right.other_value].compile()

    expected = dd.merge_asof(
        time_keyed_df1,
        time_keyed_df2,
        on='time',
        by='key',
        tolerance=Timedelta('2D'),
    )

    actual_frame = result[expected.columns].compute(scheduler='single-threaded')
    expected_frame = expected.compute(scheduler='single-threaded')
    tm.assert_frame_equal(actual_frame, expected_frame)
def main_create_giga_ds(
    URL,
    output_path=r"C:\Users\shawn paul\Desktop\PyFinanceProj\NASDAQPrediction\stored_data",
    npartitions=3,
    expected_columns=200,
):
    """Merge every ticker's frame into one dataset and write it to parquet.

    Args:
        URL: Passed to
            ``download_all_dataframes.return_dictonaries_of_stock_tickers``;
            the values of the mapping it returns are used as ticker symbols.
        output_path: Destination handed to ``DataFrame.to_parquet``.
            Defaults to the location that was previously hard-coded.
        npartitions: Number of partitions for each per-ticker dask frame.
        expected_columns: Column count the cleaned frame must have.

    Raises:
        ValueError: If no ticker symbols were returned, so there is nothing
            to merge.
        AssertionError: If the cleaned frame does not have
            ``expected_columns`` columns.
    """
    syms = download_all_dataframes.return_dictonaries_of_stock_tickers(URL)
    syms = list(syms.values())
    if not syms:
        # Previously this fell through to an UnboundLocalError further down.
        raise ValueError("no ticker symbols returned for the given URL")

    merged = None
    for sym in syms:
        frame = read_pq(sym)
        if merged is None:
            # Only the first ticker is logged, as before.
            logger.info("ticker")
            logger.info(sym)
        frame = dd.from_pandas(frame, npartitions=npartitions)
        if merged is None:
            merged = frame
        else:
            # Align each subsequent ticker on the index of the running merge.
            merged = dd.merge_asof(
                merged, frame, left_index=True, right_index=True
            )

    df = clean_final_df_cols(merged, syms)

    # Raised explicitly (same exception type and message as the former
    # assert) so the check still runs when Python is started with -O.
    if len(df.columns) != expected_columns:
        raise AssertionError("columns have not been dropped")

    logger.info("Number of columns")
    logger.info(len(df.columns))

    # Coerce row by row to numeric; values that cannot be parsed become NaN.
    df = df.apply(pd.to_numeric, axis=1, errors='coerce')
    df = df.compute()
    print(df.head(6))
    df.to_parquet(output_path, engine='pyarrow')