def get_combined_frame(input_frame, result_frame, include_on_out_fields):
    combined_frame = dt.Frame()
    for include_column in include_on_out_fields:
        combined_frame = dt.cbind(combined_frame, input_frame[include_column])
    combined_frame = dt.cbind(combined_frame, result_frame)
    return combined_frame
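A minimal usage sketch for `get_combined_frame()` above; the frames and the column list are hypothetical stand-ins, and `import datatable as dt` is assumed:

# Hypothetical inputs for get_combined_frame()
input_frame = dt.Frame(id=[1, 2, 3], score=[0.2, 0.5, 0.9], label=["a", "b", "c"])
result_frame = dt.Frame(prediction=[0, 1, 1])
# Keep only the selected input columns, then append the results
combined = get_combined_frame(input_frame, result_frame, ["id", "label"])
assert combined.names == ("id", "label", "prediction")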
def test_cbind_issue2024():
    DT = dt.Frame([[]] * 2, names=["A.1", "A.5"])
    with pytest.warns(DatatableWarning):
        RZ = dt.cbind(DT, DT)
        assert RZ.names == ("A.1", "A.5", "A.2", "A.6")
        RZ = dt.cbind(DT, DT, DT)
        assert RZ.names == ("A.1", "A.5", "A.2", "A.6", "A.3", "A.7")
        RZ = dt.cbind(DT, DT, DT, DT)
        assert RZ.names == ("A.1", "A.5", "A.2", "A.6", "A.3", "A.7",
                            "A.4", "A.8")
        RZ = dt.cbind(DT, DT, DT, DT, DT)
        assert RZ.names == ("A.1", "A.5", "A.2", "A.6", "A.3", "A.7",
                            "A.4", "A.8", "A.9", "A.10")
def test_issue1905():
    # cbind() is passed a generator, where each generated Frame is a
    # temporary. In this case cbind() should take care to keep the
    # references to all frames while working, lest they get gc-d by
    # the end of the generator loop.
    DT = dt.cbind(dt.Frame(range(50), names=[str(i)]) for i in range(30))
    assert DT.shape == (50, 30)
def test_issue2055(numpy):
    DT = dt.cbind(dt.Frame(A=[1, 2]),
                  dt.Frame(numpy.ma.array([True, True], mask=[False, False])))
    DT.nrows = 1
    DT = DT.copy()
    frame_integrity_check(DT)
    assert DT.to_list() == [[1], [True]]
def test_issue1921():
    n = 1921
    DTA = dt.Frame(A=range(n))
    DTB = dt.repeat(dt.Frame(B=["hey"], stype=dt.str64), n)
    DT = dt.cbind(DTA, DTB)
    out = DT.to_csv()
    assert out == "\n".join(["A,B"] +
                            ["%d,hey" % i for i in range(n)] +
                            [""])
def test_columnset_sum(DT):
    assert_equals(DT[:, f[int].extend(f[float])],
                  DT[:, [int, float]])
    assert_equals(DT[:, f[:3].extend(f[-3:])],
                  DT[:, [0, 1, 2, -3, -2, -1]])
    assert_equals(DT[:, f['A', 'B', 'C'].extend(f['E', 'F', 'G'])],
                  DT[:, [0, 1, 2, -3, -2, -1]])
    assert_equals(DT[:, f.A.extend(f.B)],
                  DT[:, ['A', 'B']])
    assert_equals(DT[:, f[:].extend({"extra": f.A + f.C})],
                  dt.cbind(DT, DT[:, {"extra": f.A + f.C}]))
def py_tidy_descriptive_stats(DT):
    """Generate summary statistics of a datatable Frame"""
    datos_dict = DT.to_dict()
    summary_stats_of_dict = {
        k: [
            np.nanmean(v),
            np.nanmedian(v),
            np.nanmin(v),
            np.nanmax(v),
            np.nanstd(v),
            np.percentile(v, 25, interpolation='midpoint'),
            np.percentile(v, 75, interpolation='midpoint'),
            np.percentile(v, 75, interpolation='midpoint')
            - np.percentile(v, 25, interpolation='midpoint'),
            np.nanstd(v) / np.sqrt(np.shape(v)[0])
        ]
        for k, v in datos_dict.items()
    }
    summary_dict_names = dt.Frame({
        'descriptive_stats': ['Mean', 'Median', 'Min', 'Max', 'Std',
                              'Q1', 'Q3', 'IQR', 'SE']
    })
    summary_stats_of_dict_prep = {
        k: [np.round(x, 3) for x in v]
        for k, v in summary_stats_of_dict.items()
    }
    summary_stat_dt = dt.Frame(summary_stats_of_dict_prep)
    return dt.cbind(summary_dict_names, summary_stat_dt)
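A quick sketch of calling `py_tidy_descriptive_stats()` on a small numeric Frame; the data is made up, and `import datatable as dt` plus `import numpy as np` are assumed (note that the `interpolation=` keyword of `np.percentile` used above is deprecated in newer NumPy in favor of `method=`):

# Hypothetical Frame with two numeric columns
DT = dt.Frame(A=[1.0, 2.0, 3.0, 4.0], B=[10.0, 20.0, 30.0, 40.0])
stats = py_tidy_descriptive_stats(DT)
# stats has 9 rows, one per statistic:
# Mean, Median, Min, Max, Std, Q1, Q3, IQR, SE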
def test_materialize():
    DT1 = dt.Frame(A=range(12))[::2, :]
    DT2 = dt.repeat(dt.Frame(B=["red", "green", "blue"]), 2)
    DT3 = dt.Frame(C=[4, 2, 9.1, 12, 0])
    DT = dt.cbind(DT1, DT2, DT3, force=True)
    assert frame_columns_virtual(DT) == (True, True, True)
    DT.materialize()
    assert frame_columns_virtual(DT) == (False, False, False)
def test_aggregate_3d_fixed_small_radius():
    DT = dt.Frame([range(10)] * 3)
    [DTE, DTM] = aggregate(DT, min_rows=0, nd_max_bins=1,
                           fixed_radius=0.1)
    DTE_ref = dt.cbind(
        DT,
        dt.Frame([1] * 10 / dt.stype.int32, names=["members_count"]))
    DTM_ref = dt.Frame(range(10), names=["exemplar_id"])
    assert_equals(DTE, DTE_ref)
    assert_equals(DTM, DTM_ref)
def test_cbind_method():
    d0 = dt.Frame({"A": [1, 2, 3]})
    d1 = dt.Frame({"B": list('abc')})
    d2 = dt.Frame({"C": [5.6, 7.1, -3.3]})
    dr = dt.cbind(d0, d1, d2)
    assert dr.names == ("A", "B", "C")
    res = dt.Frame([[1, 2, 3], ["a", "b", "c"], [5.6, 7.1, -3.3]],
                   names=("A", "B", "C"))
    assert_equals(dr, res)
def test_cbind_notinplace():
    d0 = dt.Frame({"A": [1, 2, 3]})
    d1 = dt.Frame({"B": [4, 5, 6]})
    dt_compute_stats(d0, d1)
    dd = dt.cbind(d0, d1)
    dr = dt.Frame({"A": [1, 2, 3], "B": [4, 5, 6]})
    assert_equals(dd, dr)
    assert_equals(d0, dt.Frame({"A": [1, 2, 3]}))
    assert_equals(d1, dt.Frame({"B": [4, 5, 6]}))
def test_debug_logger_default_with_report_args(capsys):
    assert dt.options.debug.logger is None
    with dt.options.debug.context(enabled=True, report_args=True):
        assert dt.options.debug.logger is None
        assert dt.options.debug.enabled is True
        DT = dt.Frame(range(100000))
        out, err = capsys.readouterr()
        print(out)
        assert not err
        assert re.search(r"<Frame#[\da-fA-F]+>.__init__\(range\(0, 100000\)\)",
                         out)
        assert re.search(r"# \d+(?:\.\d+)?(?:[eE][+-]?\d+)? s", out)
        with pytest.raises(TypeError):
            dt.cbind(3)
        out, err = capsys.readouterr()
        assert not err
        assert "datatable.cbind(3) {" in out
        assert re.search(r"} # \d+(?:\.\d+)?(?:[eE][+-]?\d+)? s \(failed\)", out)
def test_materialize():
    DT1 = dt.Frame(A=range(12))[::2, :]
    DT2 = dt.repeat(dt.Frame(B=["red", "green", "blue"]), 2)
    DT3 = dt.Frame(C=[4, 2, 9.1, 12, 0])
    DT = dt.cbind(DT1, DT2, DT3, force=True)
    assert frame_column_rowindex(DT, 0).type == "slice"
    assert frame_column_rowindex(DT, 1).type == "arr32"
    assert frame_column_rowindex(DT, 2) is None
    DT.materialize()
    assert frame_column_rowindex(DT, 0) is None
    assert frame_column_rowindex(DT, 1) is None
    assert frame_column_rowindex(DT, 2) is None
def test_topandas_view_mixed():
    d0 = dt.Frame(A=range(100))
    d1 = d0[7:17, :]
    d2 = dt.Frame(B=['foo', 'bar', 'buzz'] * 3 + ['finale'])
    d3 = dt.Frame(V=[2.2222])
    d3.nrows = 10
    dd = dt.cbind(d1, d2, d3)
    pp = dd.to_pandas()
    assert pp.columns.tolist() == ["A", "B", "V"]
    assert pp["A"].tolist() == list(range(7, 17))
    assert pp["B"].tolist() == d2.to_list()[0]
    assert pp["V"].tolist()[0] == 2.2222
    assert all(math.isnan(x) for x in pp["V"].tolist()[1:])
def pd_dt_concat(frames, axis=0):
    """
    Concatenate a sequence of datatable Frames or pandas DataFrames
    `frames` along `axis` (0 means rows, 1 means columns).
    """
    if USE_DT:
        if axis == 0:
            return dt.rbind(*frames)
        elif axis == 1:
            return dt.cbind(*frames)
        else:
            raise ValueError('invalid axis:', axis)
    else:
        return pd.concat(frames, axis=axis)
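A short usage sketch for `pd_dt_concat()`; the frames are made up, and `USE_DT = True` with `import datatable as dt` is assumed:

# Hypothetical frames to concatenate
a = dt.Frame(A=[1, 2, 3])
b = dt.Frame(B=["x", "y", "z"])
wide = pd_dt_concat([a, b], axis=1)   # dispatches to dt.cbind
tall = pd_dt_concat([a, a], axis=0)   # dispatches to dt.rbind
assert wide.shape == (3, 2)
assert tall.shape == (6, 1)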
def write_table(df, name, output_dir, add_index=True):
    """
    Add a primary key to df ('id' column) and write it to output_dir
    as a .csv file.

    @param df: [`datatable.Frame`] A PharmacoDB table
    @param name: [`string`] The name of the table
    @param output_dir: [`string`] The directory to write the table to
    @return: [`datatable.Frame`] The indexed PharmacoDB table
    """
    print(f'Writing {name} table to {output_dir}...')
    if add_index:
        # Prepend a 1-based 'id' column to index the datatable
        df = dt.cbind(dt.Frame(id=np.arange(df.nrows) + 1), df)
    # Write to .csv
    df.to_csv(os.path.join(output_dir, f'{name}.csv'))
    return df
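A hypothetical call to `write_table()`; the frame contents and output directory are stand-ins, assuming `import datatable as dt`, `import numpy as np`, and `import os`:

# Made-up table; writes /tmp/drug.csv with a prepended 1-based 'id' column
drugs = dt.Frame(name=["aspirin", "ibuprofen"])
indexed = write_table(drugs, "drug", "/tmp")
assert indexed.names == ("id", "name")
assert indexed[0, "id"] == 1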
def pydt_reshape_wide_to_long(DT, *measure_vars, var_name=None, val_name=None):
    """Reshape a datatable Frame from wide columns to long format"""
    dt_cols = [*measure_vars]
    measure_col_dict = DT[:, [*measure_vars]].to_dict()
    variables_dict = {'variable': [], 'value': []}
    for k, v in measure_col_dict.items():
        variables_dict['variable'].extend(repeat(k, len(v)))
        variables_dict['value'].extend(v)
    wide_to_long_dt = dt.Frame(variables_dict)
    removed_cols_dt = DT[:, f[:].remove([f[col] for col in dt_cols])].to_dict()
    non_measures_dt = dt.Frame({
        k: list(chain.from_iterable(repeat(v, len(dt_cols))))
        for k, v in removed_cols_dt.items()
    })
    if var_name is not None and val_name is not None:
        wide_to_long_dt.names = {'variable': var_name, 'value': val_name}
    wide_to_long_prep_dt = dt.cbind(non_measures_dt, wide_to_long_dt)
    return wide_to_long_prep_dt
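A sketch of melting a wide Frame with `pydt_reshape_wide_to_long()`; the column names are illustrative, and the helper's module-level imports (`datatable as dt`, `f` from datatable, `repeat`/`chain` from itertools) are assumed:

# Hypothetical wide Frame: one id column and two measure columns
DT = dt.Frame(id=[1, 2], q1=[10, 20], q2=[30, 40])
long_dt = pydt_reshape_wide_to_long(DT, 'q1', 'q2',
                                    var_name='question', val_name='answer')
# long_dt has 4 rows: `id` repeated once per measure column, cbind-ed
# to the stacked (question, answer) pairs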
def test_cbind_api():
    DT1 = dt.Frame(A=[1, 2, 3])
    DT2 = dt.Frame(B=[-4, -5, None])
    DT3 = dt.Frame(X=["makes", "gonna", "make"])
    RES1 = dt.cbind(DT1, DT2, DT3)
    RES2 = dt.cbind([DT1, DT2, DT3])
    RES3 = dt.cbind((DT1, DT2, DT3))  # tuple
    RES4 = dt.cbind([DT1], [DT2, DT3])
    RES5 = dt.cbind(DT1, [DT2], DT3)
    RES6 = dt.cbind((frame for frame in [DT1, DT2, DT3]))  # generator
    assert_equals(RES1, RES2)
    assert_equals(RES1, RES3)
    assert_equals(RES1, RES4)
    assert_equals(RES1, RES5)
    assert_equals(RES1, RES6)
def dtm_to_datatable(dtm, doc_labels, vocab, colname_rowindex='_doc'):
    """
    Convert a (sparse) DTM to a datatable Frame using document labels
    `doc_labels` as row identifier (with column name `colname_rowindex`)
    and `vocab` as column names.

    .. seealso:: :func:`~tmtoolkit.bow.dtm.dtm_to_dataframe` for generating
                 a pandas DataFrame.

    :param dtm: (sparse) document-term matrix of size NxM (N docs, M is
                vocab size) with raw term counts
    :param doc_labels: document labels used as row index (row names); size
                       must equal number of rows in `dtm`
    :param vocab: list or array of vocabulary used as column names; size
                  must equal number of columns in `dtm`
    :param colname_rowindex: column name for the row identifier (i.e. the
                             column where the document labels are put)
    :return: datatable Frame
    """
    if not USE_DT:
        raise RuntimeError('this function requires the package "datatable" '
                           'to be installed')

    import datatable as dt

    if dtm.ndim != 2:
        raise ValueError('`dtm` must be a 2D array/matrix')
    if dtm.shape[0] != len(doc_labels):
        raise ValueError('number of rows must be equal to `len(doc_labels)`')
    if dtm.shape[1] != len(vocab):
        raise ValueError('number of columns must be equal to `len(vocab)`')

    if isinstance(dtm, np.matrix):
        dtm = dtm.A
    if not isinstance(dtm, np.ndarray):
        dtm = dtm.toarray()

    return dt.cbind(dt.Frame({colname_rowindex: doc_labels}),
                    dt.Frame(dtm, names=list(vocab)))
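An illustrative call to `dtm_to_datatable()` with a small dense matrix; the labels and vocabulary are made up, and `USE_DT = True` plus `import numpy as np` are assumed:

# Hypothetical 2-document, 3-term count matrix
dtm = np.array([[1, 0, 2],
                [0, 3, 1]])
DT = dtm_to_datatable(dtm, doc_labels=['doc1', 'doc2'],
                      vocab=['apple', 'banana', 'cherry'])
# DT columns: _doc, apple, banana, cherry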
def test_cbind_nones():
    DT = dt.cbind(None, dt.Frame(A=range(5)), None,
                  dt.Frame(B=[0] * 5))
    assert_equals(DT, dt.Frame(A=range(5), B=[0] * 5))
})

##### Notes:

* We now have a pandas DF **seatle_dates_df** with these columns: year, month, day, hour and week_day; it needs to be converted to a DT for further analysis.
* Here, the DF is passed to the **dt.Frame()** method and assigned to a new DT, as illustrated below.

# Converting the dates df to a dt
seatle_dates_dt = dt.Frame(seatle_dates_df)

* In our original seattle DT we no longer need the date column in string format, so it is better to remove it from the DT.

# deleting the string date column from the dt
del seattle_bikes_dt['date']

* The DTs **seatle_dates_dt** and **seattle_bikes_dt** will be concatenated with the help of the **dt.cbind()** function.

# Concatenating two dts to get a tidy dt
seatle_bikes_dt_tidy = dt.cbind(seatle_dates_dt, seattle_bikes_dt)

# Tidy DT: first 4 and last 4 observations
seatle_bikes_dt_tidy

##### Notes:

* We can see that the crossing column has several distinct categorical level values, and 3 of them have long names; to make them shorter, their values should be updated with shorter strings.
* In datatable this can be achieved as done in the 4 code chunks below.
* The basic syntax is: DT[column value condition, the column to be updated] = 'new value'

# Modifying observations of the crossing col - set 1
seatle_bikes_dt_tidy[f.crossing == "39th Ave NE Greenway at NE 62nd St", f.crossing] = 'Greenwayway-NE-62Strt'
def join(names1, names2):
    with pytest.warns(DatatableWarning):
        return dt.cbind(dt.Frame(names=names1),
                        dt.Frame(names=names2)).names
def test_cbind_0rows_3():
    DT0 = dt.Frame(A=[], B=[], C=[])
    RES1 = dt.cbind(dt.Frame(), DT0)
    RES2 = dt.cbind(DT0, dt.Frame())
    assert_equals(RES1, DT0)
    assert_equals(RES2, DT0)
def test_cbind_0rows_1():
    """Issue #1604."""
    res = dt.cbind(dt.Frame(A=[]), dt.Frame(B=[]))
    assert res.names == ("A", "B")
    assert res.shape == (0, 2)
def create_data(X: dt.Frame = None):
    """ Convert transactional data to i.i.d. data by making time-based aggregations """
    if X is None:
        X = TransactionalToIID.make_transactions()
        if not make_features_from_scratch:
            return {'raw_transactions_non_iid': X}
    if X is None:
        return []

    X_pd = X[:, [col_date, col_group, target]].to_pandas()  # faster, since only working on a few cols
    X_pd[col_row_id] = np.arange(X_pd.shape[0])
    y = X_pd[target]
    y_enc = target + ".enc"

    # Create boolean target
    X_pd[y_enc] = (y == target_labels[1]).astype(int)

    # Make sure time is datetime64, not string
    X_pd[col_date] = pd.to_datetime(X_pd[col_date])

    for leak in leaky_choices:
        # Create the groups
        groups = X_pd.groupby(col_group)
        shift_amount = 0 if leak else 1  # this is critical to avoid leaks! DO NOT SET IT TO 0 IN PRODUCTION!
        # Compute aggregation over time
        for t in window_length_days:
            t_days = str(t) + "d"  # pandas will do rolling window over this many days ('5d' etc.)
            for op in operators:
                lag_feature = []
                for _, df in groups:
                    df = df.sort_values(col_date)
                    time_window = df.set_index(col_date)[y_enc].shift(shift_amount). \
                        dropna().rolling(t_days, min_periods=1)  # get time window. if leaky, includes self
                    res = getattr(time_window, op)()  # apply operator on time window
                    res.index = df.index[shift_amount:]
                    lag_feature.append(res)
                # Index is set on both sides so equal works and reorders rows automatically
                X_pd["%s%s_%s_past_%d_days_grouped_by_%s" %
                     ("leaky_" if leak else "", op, target, t, col_group)] = \
                    pd.concat(lag_feature, axis=0)

    del X_pd[y_enc]  # delete temporary binary response column
    # delete grouping column, since have all aggregations already in iid form
    del X_pd[col_group]
    del X[col_group]

    # create datatable frame of new features (only)
    X_features = dt.Frame(X_pd.loc[:, [x for x in X_pd.columns
                                       if x not in [col_date, target, col_row_id]]])
    # add new features to original frame
    X_new = dt.cbind(X, X_features)

    out = {}
    for name, time_range in {
            # 2-way split: ideal for iid, let Driverless do internal validation splits on training split
            'train_iid': X_pd[col_date] <= split_date,
            'test_iid': X_pd[col_date] > split_date
    }.items():
        # X_pd is pandas - easier to deal with time slices, and keep row_id to index into datatable below
        which_rows = X_pd.loc[time_range, col_row_id].reset_index(drop=True).values
        if shuffle:
            np.random.shuffle(which_rows)  # shuffle data for generality - no impact on iid modeling
            name += ".shuf"
        for leak in leaky_choices:
            X_out = X_new.copy()  # shallow copy
            if leak:
                cols_to_del = [x for x in X_features.names if "leaky" != x[:5]]
            else:
                cols_to_del = [x for x in X_features.names if "leaky" == x[:5]]
            del X_out[:, cols_to_del]
            out[name + (".leaky" if leak else "")] = X_out[which_rows, :]
    return out
py_dt_two_group_proportions_summary(policia_tidy_dt, 'driver_race', 'is_arrested')

# stop time
stop_time_df = policia_tidy_dt[:, (f.stop_time)].to_pandas()

# extracting the hour
stop_time_hour = stop_time_df.stop_time.str.extract(r'([\d]{2})')

# a new dataframe
stop_time_hour_dt = dt.Frame(stop_time_hour)

# change a col name
stop_time_hour_dt.names = {'0': 'stop_hour'}

# Binding two dts
policia_tidy_dt_v1 = dt.cbind(policia_tidy_dt, stop_time_hour_dt)

# Hour wise arrests
hour_wise_arrests_dt = py_dt_two_group_proportions_summary(policia_tidy_dt_v1,
                                                           'stop_hour', 'is_arrested')

# Visualization
alt.Chart(hour_wise_arrests_dt.to_pandas()).mark_bar().encode(
    alt.X('stop_hour:N'),
    alt.Y('count'),
    alt.Color('is_arrested')
).properties(
    title='Hour wise arrest trends'
)

# Hour wise arrest rates
amigos_info_dt[:, dt.update(temp=f.directed_by == f.written_by)]

# are the directors and writers the same for a title?
amigos_info_dt[f.temp == True, :]

# remove the temp col
del amigos_info_dt["temp"]

# split the writers column
writers_list = [elemento.split('&')
                for elemento in amigos_info_dt[:, f.written_by].to_list()[0]]

# create a new DT with the number of writers
writers_dt = dt.Frame({'no_of_writers': [len(elem) for elem in writers_list]})

# Joining the two DTs
amigos_info_dt_v1 = dt.cbind(amigos_info_dt, writers_dt)

# No of writers
alt.Chart(amigos_info_dt_v1[:, count(), by(f.no_of_writers)].to_pandas()).mark_bar().encode(
    alt.X('count'),
    alt.Y('no_of_writers:O')
).properties(
    title='Number of writers in titles'
)

amigos_year = dt.Frame({'year': [re.findall(r'[\d]{4}', fecha)[0]
                                 for fecha in amigos_info_dt_v1[:, f.air_date].to_list()[0]]})

alt.Chart(amigos_year[:, count(), by(f.year)].to_pandas()).mark_line().encode(
    alt.X('year'),
    alt.Y('count')
)
def test_cbind_empty3():
    DT = dt.cbind()
    assert_equals(DT, dt.Frame())
def test_create_from_doublestar_expansion():
    DT0 = dt.Frame(A=range(3), B=["df", "qe;r", None])
    DT1 = dt.Frame(D=[7.99, -12.5, 0.1], E=[None] * 3)
    DT = dt.Frame(**DT0, **DT1)
    assert_equals(DT, dt.cbind(DT0, DT1))
def test_cbind_expanded_frame():
    DT = dt.Frame(A=[1, 2], B=['a', "E"], C=[7, 1000], D=[-3.14, 159265])
    RES = dt.cbind(*DT)
    assert_equals(DT, RES)