def test_errors_for_merge_on_frame_columns():
    a = pd.DataFrame({'x': [1, 2, 3, 4, 5]}, index=[1, 2, 3, 4, 5])
    b = pd.DataFrame({'y': [1, 2, 3, 4, 5]}, index=[5, 4, 3, 2, 1])

    aa = dd.from_pandas(a, npartitions=3, sort=False)
    bb = dd.from_pandas(b, npartitions=2)

    with pytest.raises(NotImplementedError):
        dd.merge(aa, bb, left_on='x', right_on=bb.y)

    with pytest.raises(NotImplementedError):
        dd.merge(aa, bb, left_on=aa.x, right_on=bb.y)
def test_merge():
    A = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': [1, 1, 2, 2, 3, 4]})
    a = dd.repartition(A, [0, 4, 5])

    B = pd.DataFrame({'y': [1, 3, 4, 4, 5, 6], 'z': [6, 5, 4, 3, 2, 1]})
    b = dd.repartition(B, [0, 2, 5])

    list_eq(dd.merge(a, b, left_index=True, right_index=True),
            pd.merge(A, B, left_index=True, right_index=True))
    list_eq(dd.merge(a, b, on='y'),
            pd.merge(A, B, on='y'))
    list_eq(dd.merge(a, b, left_on='x', right_on='z'),
            pd.merge(A, B, left_on='x', right_on='z'))

    list_eq(dd.merge(a, b), pd.merge(A, B))
    list_eq(dd.merge(a, B), pd.merge(A, B))
    list_eq(dd.merge(A, b), pd.merge(A, B))
    list_eq(dd.merge(A, B), pd.merge(A, B))

    list_eq(dd.merge(a, b, left_index=True, right_index=True),
            pd.merge(A, B, left_index=True, right_index=True))
def execute_cross_join(op, left, right, **kwargs):
    """Execute a cross join in dask.

    Notes
    -----
    We create a dummy column of all :data:`True` instances and use that as
    the join key. This results in the desired Cartesian product behavior
    guaranteed by cross join.
    """
    # generate a unique name for the temporary join key
    key = "cross_join_{}".format(ibis.util.guid())
    join_key = {key: True}
    new_left = left.assign(**join_key)
    new_right = right.assign(**join_key)

    # inner/outer doesn't matter because every row matches every other row
    result = dd.merge(
        new_left,
        new_right,
        how='inner',
        on=key,
        suffixes=constants.JOIN_SUFFIXES,
    )

    # remove the generated key
    del result[key]

    return result
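# The dummy-key trick described in the docstring above is not specific to the
# ibis backend. A minimal standalone sketch of the same idea with plain dask
# dataframes (the frames and the "_cross_join_key" name below are illustrative
# assumptions, not from the source) could look like this:
import pandas as pd
import dask.dataframe as dd

left = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=2)
right = dd.from_pandas(pd.DataFrame({"b": ["x", "y"]}), npartitions=1)

key = "_cross_join_key"
cross = dd.merge(
    left.assign(**{key: True}),    # constant column on both sides
    right.assign(**{key: True}),   # so every row matches every row
    how="inner",
    on=key,
)
del cross[key]                     # drop the temporary join key
print(cross.compute())             # 3 x 2 = 6 rows: the Cartesian product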
def combine_data(
    songwriter_df_path=paths["songwriter_df_path"],
    compressed_genre_path=paths["compressed_genre_path"],
    pitch_timbre_df_path=paths["segment_path"],
    song_features_path=paths["song_features_path"],
):
    """Creates dask dataframe of songs with features ready for modeling"""
    list_of_paths = [
        songwriter_df_path,
        compressed_genre_path,
        pitch_timbre_df_path,
        song_features_path,
    ]
    latest_file_list = list(map(find_latest_file, list_of_paths))

    songwriter_df = (
        dd.read_csv(
            latest_file_list[0],
            # should be pd.Int32Dtype() but running into error
            dtype={"IPI": np.float64},
        )
        .rename(columns={"Unnamed: 0": "index"})
        .set_index("index")
    )
    compressed_genre_df = (
        dd.read_csv(latest_file_list[1])
        .rename(columns={"Unnamed: 0": "index"})
        .set_index("index")
    )
    pitch_timbre_df = (
        dd.read_csv(latest_file_list[2])
        .rename(columns={"Unnamed: 0": "track_id"})
    )
    song_features_df = (
        dd.read_csv(latest_file_list[3])
        .rename(columns={"Unnamed: 0": "index"})
        .set_index("index")
    )

    songwriter_and_genres = dd.merge(songwriter_df, compressed_genre_df,
                                     on="track_id")
    songwriter_genres_and_pt = dd.merge(songwriter_and_genres, pitch_timbre_df,
                                        on="track_id")
    ready_for_modeling_df = dd.merge(songwriter_genres_and_pt, song_features_df,
                                     on="track_id")

    return ready_for_modeling_df
def process_introns(data_dir, num_samples, num_threads=4):

    def chunks(l, n):
        """Yield successive n-sized chunks from l."""
        for i in range(0, len(l), n):
            yield l[i:i + n]

    dfs = []
    for i in range(num_samples):
        columns = ["chromosome", "start", "end", f"{i+1}_count", "strand"]
        if os.path.exists(data_dir / f'sample_{i+1}.splice.gz'):
            filename = data_dir / f'sample_{i+1}.splice.gz'
            _df = dd.read_csv(filename, sep=' ', blocksize=None, names=columns,
                              usecols=[0, 1, 2, 3, 4], compression='gzip')
        elif os.path.exists(data_dir / f'sample_{i+1}.splice'):
            filename = data_dir / f'sample_{i+1}.splice'
            _df = dd.read_csv(filename, sep=' ', blocksize=None, names=columns,
                              usecols=[0, 1, 2, 3, 4])
        else:
            raise Exception("Splice file doesn't exist!")

        # drop the negative read counts if any
        _df = _df[_df[f"{i+1}_count"] >= 0]
        dfs.append(_df)

    while len(dfs) > 1:
        _list = []
        for chunk in chunks(dfs, 5):
            df = delayed(reduce)(
                lambda x, y: dd.merge(x, y, how='outer',
                                      on=['chromosome', 'start', 'end', 'strand']),
                chunk)
            _list.append(df)
        dfs = _list

    df = compute(*dfs, num_workers=num_threads)[0]
    df.fillna(0, inplace=True)

    if num_samples > 10:
        column_names = list(
            set(df.columns.values) -
            set(['chromosome', 'start', 'end', 'strand']))
        df = df[(df[column_names] > 3).any(axis=1)]

    coord_columns = ['chromosome', 'strand', 'start', 'end']
    index_df = df[coord_columns].copy()
    index_df['index'] = df[coord_columns].apply(lambda x: tuple(x), axis=1)
    index_df.set_index(coord_columns, inplace=True)

    df['index'] = df[coord_columns].apply(lambda x: tuple(x), axis=1)
    df.drop(coord_columns, axis=1, inplace=True)
    df.set_index('index', inplace=True)

    return df, index_df
def count_cross_feat_hour(df, feat_1, feat_2):
    cname = feat_1 + "_" + feat_2 + "hour"
    add = (df.groupby([feat_1, feat_2, "hour"], sort=False)
             .size().reset_index().rename(columns={0: cname}))
    df = dd.merge(df, add, 'left', on=[feat_1, feat_2, "hour"])
    df[cname] = df[cname].astype(np.int32)
    return df
def gen_is_first_feat(train_data, feat):
    train_data_2 = train_data.sort_values(by=["user_id", feat, "context_timestamp"],
                                          ascending=True)
    first = train_data_2.drop_duplicates(["user_id", feat])
    first['is_first_user_' + feat] = 1
    first = first[["user_id", feat, "context_timestamp", 'is_first_user_' + feat]]

    train_data = dd.merge(train_data, first, how="left",
                          on=["user_id", feat, "context_timestamp"])
    train_data = train_data.fillna({'is_first_user_' + feat: 0})

    first = first.rename(columns={"context_timestamp": "is_first_time_gap_" + feat})[
        ["user_id", feat, "is_first_time_gap_" + feat]]
    train_data = dd.merge(train_data, first, on=["user_id", feat], how="left")
    train_data["is_first_time_gap_" + feat] = (
        train_data["is_first_time_gap_" + feat] -
        train_data["context_timestamp"]).dt.total_seconds()
    train_data["is_first_time_gap_" + feat] = train_data[
        "is_first_time_gap_" + feat].astype(np.int32)
    train_data['is_first_user_' + feat] = train_data[
        'is_first_user_' + feat].astype(np.int32)

    del train_data_2, first
    return train_data
def add_category(wallets, df):
    wallet_owners = wallets[['owner', 'category']].drop_duplicates(
        subset='owner', keep='last').reset_index(drop=True)

    sender = dd.merge(df, wallet_owners, left_on='sender_name',
                      right_on='owner', how='left')
    columns = ['receiver_name', 'receiver_category',
               'sender_name', 'sender_category']
    sender = sender.drop(columns, axis=1)
    sender = sender.rename(columns={
        "owner": "sender_name",
        "category": "sender_category",
    })

    receiver = dd.merge(df, wallet_owners, left_on='receiver_name',
                        right_on='owner', how='left')
    columns = ['sender_name', 'sender_category',
               'receiver_name', 'receiver_category']
    receiver = receiver.drop(columns, axis=1)
    receiver = receiver.rename(columns={
        "owner": "receiver_name",
        "category": "receiver_category",
    })

    tnx_category = dd.merge(sender, receiver, how='inner', on=[
        'hash', 'block_timestamp', 'sender', 'receiver', 'date', 'btc',
        'dollar', 'percent_marketcap', 'PriceUSD'
    ])
    tnx_category = tnx_category[[
        'hash', 'block_timestamp', 'sender', 'receiver', 'btc', 'dollar',
        'PriceUSD', 'percent_marketcap', 'sender_name', 'sender_category',
        'receiver_name', 'receiver_category'
    ]]
    return tnx_category
def main():
    # create two dask dataframes (optimized for large datasets)
    df1 = dd.read_csv(FILE1)
    df2 = dd.read_csv(FILE2)

    # merge them by doing an inner join operation
    df3 = dd.merge(df1, df2, on=['name', 'age', 'email'], how='inner')
    print(df3.head())
def getClustersIndex(clusters, users_genres):
    clusters = dd.from_dask_array(clusters)
    clusters = clusters.reset_index().rename(columns={0: 'cluster'})
    users_genres = users_genres.reset_index()
    clusters_index = dd.merge(users_genres, clusters,
                              left_index=True, right_on='index')
    return clusters_index[['userId', 'cluster']]
def test_join(how, left, right, df1, df2):
    expr = left.join(right, left.key == right.key, how=how)[
        left, right.other_value, right.key3
    ]
    result = expr.compile()
    expected = dd.merge(df1, df2, how=how, on='key')
    tm.assert_frame_equal(
        result[expected.columns].compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
def execute_grouped_window_op(
    op,
    data,
    window,
    scope,
    timecontext,
    aggcontext,
    clients,
    **kwargs,
):
    # extract the parent
    (root,) = op.root_tables()
    root_expr = root.to_expr()

    root_data = execute(
        root_expr,
        scope=scope,
        timecontext=timecontext,
        clients=clients,
        aggcontext=aggcontext,
        **kwargs,
    )

    group_by = window._group_by
    grouping_keys = [
        key_op.name for key_op in map(operator.methodcaller('op'), group_by)
    ]

    grouped_root_data = root_data.groupby(grouping_keys)
    scope = scope.merge_scopes(
        [
            Scope({t: grouped_root_data}, timecontext)
            for t in op.expr.op().root_tables()
        ],
        overwrite=True,
    )

    result = execute_with_scope(
        expr=op.expr,
        scope=scope,
        timecontext=timecontext,
        aggcontext=aggcontext,
        clients=clients,
        **kwargs,
    )
    # If the grouped operation we performed is not an analytic UDF we have to
    # realign the output to the input.
    if not isinstance(op.expr._arg, ops.AnalyticVectorizedUDF):
        result = dd.merge(
            root_data[result.index.name].to_frame(),
            result.to_frame(),
            left_on=result.index.name,
            right_index=True,
        )[result.name]
        result.divisions = root_data.divisions
    return result
def test_join_project_left_table(how, left, right, df1, df2):
    expr = left.join(right, left.key == right.key, how=how)[left, right.key3]
    result = expr.compile()
    expected = dd.merge(df1, df2, how=how, on='key')[
        list(left.columns) + ['key3']
    ]
    tm.assert_frame_equal(
        result[expected.columns].compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
def mergeCSV(csv1, csv2, csv1_key, csv2_key, type_join):
    """Join two csv tables (read from files) based on the specified keys
    (csv1_key for file csv1 and csv2_key for file csv2). type_join specifies
    the type of join to perform."""
    table1 = dd.read_csv(csv1, low_memory=False)
    table2 = dd.read_csv(csv2, low_memory=False)
    merged_table = dd.merge(table1, table2, left_on=csv1_key,
                            right_on=csv2_key, how=type_join)
    return merged_table
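# A hypothetical call to mergeCSV; the file names, key columns, and join type
# below are illustrative assumptions, not taken from the source.
merged = mergeCSV("orders.csv", "customers.csv",
                  csv1_key="customer_id", csv2_key="id", type_join="left")
print(merged.head())  # merged is a lazy dask dataframe until computed
merged.compute().to_csv("orders_with_customers.csv", index=False)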
def test_cross_join_project_left_table(left, right, df1, df2):
    expr = left.cross_join(right)[left, right.key3]
    result = expr.compile()
    expected = dd.merge(
        df1.assign(dummy=1), df2.assign(dummy=1), how='inner', on='dummy'
    ).rename(columns={'key_x': 'key'})[list(left.columns) + ['key3']]
    tm.assert_frame_equal(
        result[expected.columns].compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
def write_out_combined_data():
    state_id = STATE
    households = dd.read_csv(
        r'output/state_{}_puma_*_households.csv'.format(state_id))
    people = dd.read_csv(r'output/state_{}_puma_*_people.csv'.format(state_id))
    combined = dd.merge(people, households, on=[inputs.HOUSEHOLD_ID.name])
    cdf = combined.compute()
    cdf.sort_values('household_id', axis=0, inplace=True)
    cdf.loc[:, 'num_people'] = cdf.num_people.replace('4+', 4).astype(int)
    cdf.to_csv(r'output/state_{}_combined_data_full.csv'.format(STATE))
def test_join_with_post_expression_selection(how, left, right, df1, df2):
    join = left.join(right, left.key == right.key, how=how)
    expr = join[left.key, left.value, right.other_value]
    result = expr.compile()
    expected = dd.merge(df1, df2, on='key', how=how)[
        ['key', 'value', 'other_value']
    ]
    tm.assert_frame_equal(
        result[expected.columns].compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
def test_merge_by_multiple_columns():
    pdf1l = pd.DataFrame({'a': list('abcdefghij'),
                          'b': list('abcdefghij'),
                          'c': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
                         index=list('abcdefghij'))
    pdf1r = pd.DataFrame({'d': list('abcdefghij'),
                          'e': list('abcdefghij'),
                          'f': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]},
                         index=list('abcdefghij'))

    pdf2l = pd.DataFrame({'a': list('abcdeabcde'),
                          'b': list('abcabcabca'),
                          'c': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
                         index=list('abcdefghij'))
    pdf2r = pd.DataFrame({'d': list('edcbaedcba'),
                          'e': list('aaabbbcccd'),
                          'f': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]},
                         index=list('fghijklmno'))

    pdf3l = pd.DataFrame({'a': list('aaaaaaaaaa'),
                          'b': list('aaaaaaaaaa'),
                          'c': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
                         index=list('abcdefghij'))
    pdf3r = pd.DataFrame({'d': list('aaabbbccaa'),
                          'e': list('abbbbbbbbb'),
                          'f': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]},
                         index=list('ABCDEFGHIJ'))

    for pdl, pdr in [(pdf1l, pdf1r), (pdf2l, pdf2r), (pdf3l, pdf3r)]:
        for lpart, rpart in [(2, 2), (3, 2), (2, 3)]:
            ddl = dd.from_pandas(pdl, lpart)
            ddr = dd.from_pandas(pdr, rpart)

            for how in ['inner', 'outer', 'left', 'right']:
                eq(ddl.join(ddr, how=how), pdl.join(pdr, how=how))
                eq(ddr.join(ddl, how=how), pdr.join(pdl, how=how))

                eq(dd.merge(ddl, ddr, how=how, left_index=True, right_index=True),
                   pd.merge(pdl, pdr, how=how, left_index=True, right_index=True))
                eq(dd.merge(ddr, ddl, how=how, left_index=True, right_index=True),
                   pd.merge(pdr, pdl, how=how, left_index=True, right_index=True))

                # hash join
                list_eq(dd.merge(ddl, ddr, how=how, left_on='a', right_on='d'),
                        pd.merge(pdl, pdr, how=how, left_on='a', right_on='d'))
                list_eq(dd.merge(ddl, ddr, how=how, left_on='b', right_on='e'),
                        pd.merge(pdl, pdr, how=how, left_on='b', right_on='e'))

                list_eq(dd.merge(ddr, ddl, how=how, left_on='d', right_on='a'),
                        pd.merge(pdr, pdl, how=how, left_on='d', right_on='a'))
                list_eq(dd.merge(ddr, ddl, how=how, left_on='e', right_on='b'),
                        pd.merge(pdr, pdl, how=how, left_on='e', right_on='b'))

                list_eq(dd.merge(ddl, ddr, how=how,
                                 left_on=['a', 'b'], right_on=['d', 'e']),
                        pd.merge(pdl, pdr, how=how,
                                 left_on=['a', 'b'], right_on=['d', 'e']))
def _find_objects(ndim, df1, df2):
    """Main utility function for find_objects."""
    meta = dd.utils.make_meta([(i, object) for i in range(ndim)])
    if isinstance(df1, Delayed):
        df1 = dd.from_delayed(df1, meta=meta)
    if isinstance(df2, Delayed):
        df2 = dd.from_delayed(df2, meta=meta)
    ddf = dd.merge(df1, df2, how="outer", left_index=True, right_index=True)
    result = ddf.apply(_merge_bounding_boxes, ndim=ndim, axis=1, meta=meta)
    return result
def test_merge_by_multiple_columns(how):
    pdf1l = pd.DataFrame({'a': list('abcdefghij'),
                          'b': list('abcdefghij'),
                          'c': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
                         index=list('abcdefghij'))
    pdf1r = pd.DataFrame({'d': list('abcdefghij'),
                          'e': list('abcdefghij'),
                          'f': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]},
                         index=list('abcdefghij'))

    pdf2l = pd.DataFrame({'a': list('abcdeabcde'),
                          'b': list('abcabcabca'),
                          'c': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
                         index=list('abcdefghij'))
    pdf2r = pd.DataFrame({'d': list('edcbaedcba'),
                          'e': list('aaabbbcccd'),
                          'f': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]},
                         index=list('fghijklmno'))

    pdf3l = pd.DataFrame({'a': list('aaaaaaaaaa'),
                          'b': list('aaaaaaaaaa'),
                          'c': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
                         index=list('abcdefghij'))
    pdf3r = pd.DataFrame({'d': list('aaabbbccaa'),
                          'e': list('abbbbbbbbb'),
                          'f': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]},
                         index=list('ABCDEFGHIJ'))

    for pdl, pdr in [(pdf1l, pdf1r), (pdf2l, pdf2r), (pdf3l, pdf3r)]:
        for lpart, rpart in [(2, 2), (3, 2), (2, 3)]:
            ddl = dd.from_pandas(pdl, lpart)
            ddr = dd.from_pandas(pdr, rpart)

            eq(ddl.join(ddr, how=how), pdl.join(pdr, how=how))
            eq(ddr.join(ddl, how=how), pdr.join(pdl, how=how))

            eq(dd.merge(ddl, ddr, how=how, left_index=True, right_index=True),
               pd.merge(pdl, pdr, how=how, left_index=True, right_index=True))
            eq(dd.merge(ddr, ddl, how=how, left_index=True, right_index=True),
               pd.merge(pdr, pdl, how=how, left_index=True, right_index=True))

            # hash join
            list_eq(dd.merge(ddl, ddr, how=how, left_on='a', right_on='d'),
                    pd.merge(pdl, pdr, how=how, left_on='a', right_on='d'))
            list_eq(dd.merge(ddl, ddr, how=how, left_on='b', right_on='e'),
                    pd.merge(pdl, pdr, how=how, left_on='b', right_on='e'))

            list_eq(dd.merge(ddr, ddl, how=how, left_on='d', right_on='a'),
                    pd.merge(pdr, pdl, how=how, left_on='d', right_on='a'))
            list_eq(dd.merge(ddr, ddl, how=how, left_on='e', right_on='b'),
                    pd.merge(pdr, pdl, how=how, left_on='e', right_on='b'))

            list_eq(dd.merge(ddl, ddr, how=how,
                             left_on=['a', 'b'], right_on=['d', 'e']),
                    pd.merge(pdl, pdr, how=how,
                             left_on=['a', 'b'], right_on=['d', 'e']))
def test_cross_join(left, right, df1, df2):
    expr = left.cross_join(right)[left, right.other_value, right.key3]
    result = expr.compile()
    expected = dd.merge(
        df1.assign(dummy=1), df2.assign(dummy=1), how='inner', on='dummy'
    ).rename(columns={'key_x': 'key'})
    del expected['dummy'], expected['key_y']
    tm.assert_frame_equal(
        result[expected.columns].compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
def test_join_with_multiple_predicates(how, left, right, df1, df2):
    expr = left.join(
        right, [left.key == right.key, left.key2 == right.key3], how=how
    )[left, right.key3, right.other_value]
    result = expr.compile()
    expected = dd.merge(
        df1, df2, how=how, left_on=['key', 'key2'], right_on=['key', 'key3']
    ).reset_index(drop=True)
    tm.assert_frame_equal(
        result[expected.columns].compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
def merge_columns(df1, df2, cols):
    """
    Merge two datasets into one based on their common column(s).

    df1  = first dataframe
    df2  = second dataframe
    cols = column or columns to merge on

    Any leftover 'Unnamed: 0' index column is dropped from the result.
    """
    data = dd.merge(df1, df2, on=cols)
    if 'Unnamed: 0' in data:
        data = data.drop('Unnamed: 0', axis=1)
    return data
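# Rough usage sketch for merge_columns; the file names and key column below
# are assumptions for illustration, not from the source.
import dask.dataframe as dd

df_left = dd.read_csv("left_table.csv")
df_right = dd.read_csv("right_table.csv")
combined = merge_columns(df_left, df_right, cols="id")
print(combined.head())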
def combine_vitals_stays(df_1, df_2):
    """combine the dataframes"""
    if not os.path.exists('data/vitals_stays.csv'):
        vitals_stays = dd.merge(df_1, df_2, how='inner',
                                on=['SUBJECT_ID', 'HADM_ID'])
        vitals_stays = vitals_stays.compute()
        vitals_stays.to_csv('data/vitals_stays.csv', index=False)
    return dd.read_csv('data/vitals_stays.csv',
                       parse_dates=['ADMITTIME', 'DISCHTIME'])
def add_y_sum_counts(df, cfg):
    meta = pd.Series(
        [],
        name="y_sum_total",
        index=pd.Index([], name="tax_id", dtype=int),
        dtype=int,
    )
    ds = df.groupby("tax_id").apply(compute_y_sum_total, cfg, meta=meta)
    ds = ds.reset_index()
    df = dd.merge(df, ds, on=["tax_id"])
    return df
def test_half_indexed_dataframe_avoids_shuffle():
    a = pd.DataFrame({"x": np.random.randint(100, size=1000)})
    b = pd.DataFrame({"y": np.random.randint(100, size=100)},
                     index=np.random.randint(100, size=100))

    aa = dd.from_pandas(a, npartitions=100)
    bb = dd.from_pandas(b, npartitions=2)

    c = pd.merge(a, b, left_index=True, right_on="y")
    cc = dd.merge(aa, bb, left_index=True, right_on="y", shuffle="tasks")

    list_eq(c, cc)

    assert len(cc.dask) < 500
def filterdata_long_dask(inputdf, threshold=None, nr_of_partitions=None):
    # this function was implemented with help of Jose A. Jimenez
    # https://stackoverflow.com/questions/62957110/pandas-selecting-multiple-rows-based-on-column-pair/
    import dask.dataframe as dd

    initialmean = inputdf.loc[inputdf["timepoint"] == 0].mean().array[-1]
    initialsd = inputdf.loc[inputdf["timepoint"] == 0].std().array[-1]
    if threshold is None:
        threshold = initialmean + initialsd

    pre_activated_t0 = inputdf[(inputdf['timepoint'] == 0)
                               & (inputdf['value'] > threshold)]
    pre_activated = inputdf.merge(pre_activated_t0[["measurement", "roi"]],
                                  how="inner", on=["measurement", "roi"])

    if nr_of_partitions is None:
        nr_of_partitions = 30
    input_dd = dd.from_pandas(inputdf, npartitions=nr_of_partitions)
    preactivated_dd = dd.from_pandas(pre_activated,
                                     npartitions=nr_of_partitions)

    merger = dd.merge(input_dd, preactivated_dd, how="left",
                      on=["timepoint", "measurement", "roi", "value"])
    filtereddf = merger.compute()
    filtereddf = filtereddf[pd.isna(filtereddf["group_y"])]
    filtereddf.drop("group_y", axis=1, inplace=True)
    filtereddf.columns = list(inputdf.columns)

    length_input = len(inputdf[inputdf["timepoint"] == 0])
    length_filtered = len(filtereddf[filtereddf["timepoint"] == 0])
    delta = length_input - length_filtered

    print('Initial Mean: ' + str(initialmean) + '. Initial SD: ' + str(initialsd))
    print('Threshold: ' + str(threshold))
    print('Dataframe was filtered')
    print('Total cells: ' + str(length_input))
    print(str(delta) + ' cells were removed')
    print('\n')
    return filtereddf, pre_activated
def reading_data(diffuse, data_size1):
    # Import data in h5py
    gammas = h5.File("../../data/3_gen/gammas.hdf5", "r")

    # Converting to pandas
    gamma_array_df = pd.DataFrame(data=dict(gammas['array_events']))
    gamma_runs_df = pd.DataFrame(data=dict(gammas['runs']))
    gamma_telescope_df = pd.DataFrame(data=dict(gammas['telescope_events']))

    gamma_array_dd = dd.from_pandas(gamma_array_df, chunksize=1000000)
    gamma_telescope_dd = dd.from_pandas(gamma_telescope_df, chunksize=1000000)

    # merging of array and telescope data and shuffle of proton and gamma
    gamma_merge = dd.merge(gamma_telescope_dd, gamma_array_dd)
    # there are some NaN in width that need to be deleted
    gamma_merge = gamma_merge.dropna()

    max_size = gamma_merge.shape[0]
    if data_size1 < 0:
        data_size = max_size - 1
    else:
        data_size = data_size1
    data = gamma_merge[:data_size]

    if diffuse:
        gammas_diffuse = h5.File("../data/3_gen/gammas_diffuse.hdf5", "r")

        gamma_diffuse_array_df = pd.DataFrame(data=dict(gammas_diffuse['array_events']))
        gamma_diffuse_runs_df = pd.DataFrame(data=dict(gammas_diffuse['runs']))
        gamma_diffuse_telescope_df = pd.DataFrame(data=dict(gammas_diffuse['telescope_events']))

        max_size_diffuse = gamma_diffuse_telescope_df.shape[0]
        if data_size1 < 0:
            data_size = max_size_diffuse - 1
        else:
            data_size = data_size1

        gamma_diffuse_array_df = gamma_diffuse_array_df.iloc[:data_size]
        gamma_diffuse_runs_df = gamma_diffuse_runs_df.iloc[:data_size]
        gamma_diffuse_telescope_df = gamma_diffuse_telescope_df.iloc[:data_size]

        gamma_diffuse_merge = pd.merge(gamma_diffuse_array_df,
                                       gamma_diffuse_telescope_df,
                                       on=list(['array_event_id', 'run_id']))
        gamma_diffuse_merge = gamma_diffuse_merge.set_index(['run_id', 'array_event_id'])
        gamma_diffuse_merge = gamma_diffuse_merge.dropna(axis=0)
        gamma_diffuse_merge = gamma_diffuse_merge.reset_index()
        gamma_merge = gamma_merge.reset_index()

        data = pd.concat([gamma_merge, gamma_diffuse_merge])
        data = data.set_index(['run_id', 'array_event_id'])
        data = data.dropna(axis=1)
        print("Using diffused data...")

    return data
def test_half_indexed_dataframe_avoids_shuffle():
    a = pd.DataFrame({'x': np.random.randint(100, size=1000)})
    b = pd.DataFrame({'y': np.random.randint(100, size=100)},
                     index=np.random.randint(100, size=100))

    aa = dd.from_pandas(a, npartitions=100)
    bb = dd.from_pandas(b, npartitions=2)

    c = pd.merge(a, b, left_index=True, right_on='y')
    cc = dd.merge(aa, bb, left_index=True, right_on='y', shuffle='tasks')

    list_eq(c, cc)

    assert len(cc.dask) < 500
def test_merge_maintains_columns():
    lhs = pd.DataFrame({'A': [1, 2, 3],
                        'B': list('abc'),
                        'C': 'foo',
                        'D': 1.0},
                       columns=list('DCBA'))
    rhs = pd.DataFrame({'G': [4, 5],
                        'H': 6.0,
                        'I': 'bar',
                        'B': list('ab')},
                       columns=list('GHIB'))
    ddf = dd.from_pandas(lhs, npartitions=1)
    merged = dd.merge(ddf, rhs, on='B').compute()
    assert tuple(merged.columns) == ('D', 'C', 'B', 'A', 'G', 'H', 'I')
def gen_is_last(train_data):
    train_data_2 = train_data.sort_values(by=["user_id", "context_timestamp"],
                                          ascending=False)
    last = train_data_2.drop_duplicates(["user_id"])
    last['is_last'] = 1
    last = last[["user_id", "context_timestamp", "is_last"]]

    train_data = dd.merge(train_data, last, how="left",
                          on=["user_id", "context_timestamp"])
    train_data = train_data.fillna({"is_last": 0})

    last = last.rename(columns={"context_timestamp": "is_last_time_gap"})[[
        "user_id", "is_last_time_gap"
    ]]
    train_data = dd.merge(train_data, last, on=["user_id"], how="left")
    train_data["is_last_time_gap"] = (
        train_data["is_last_time_gap"] -
        train_data["context_timestamp"]).dt.total_seconds()
    train_data["is_last_time_gap"] = train_data["is_last_time_gap"].astype(np.int32)
    train_data['is_last'] = train_data['is_last'].astype(np.int32)

    del train_data_2, last
    return train_data
def test_join_with_project_right_duplicate_column(client, how, left, df1, df3):
    # also test that the order of operands in the predicate doesn't matter
    right = client.table('df3')
    join = left.join(right, ['key'], how=how)
    expr = join[left.key, right.key2, right.other_value]
    result = expr.compile()

    expected = (
        dd.merge(df1, df3, on='key', how=how)
        .drop(['key2_x', 'key3', 'value'], axis=1)
        .rename(columns={'key2_y': 'key2'})
    )
    tm.assert_frame_equal(
        result[expected.columns].compute(scheduler='single-threaded'),
        expected.compute(scheduler='single-threaded'),
    )
def add_diagnosis(vitals_stays):
    """Add diagnosis to dataset"""
    admission = dd.read_csv('data/ADMISSIONS.csv')
    diagnosis = dd.read_csv('data/DIAGNOSES_ICD.csv')

    # combine the data frames
    admission_diag = dd.merge(admission, diagnosis,
                              on=['SUBJECT_ID', 'HADM_ID'], how='outer')
    admission_diag = admission_diag.compute()

    # mask for only the patients in our data
    admission_diag = admission_diag[admission_diag.HADM_ID.isin(
        vitals_stays.HADM_ID.compute().values)]

    # convert icd9 codes
    e_mask = admission_diag.ICD9_CODE.str.startswith('E')
    # starts with 'E' and longer than 4
    admission_diag.loc[e_mask, 'ICD9_CODE'] = admission_diag.loc[
        e_mask, 'ICD9_CODE'].str[:4]
    # doesn't start with 'E' and longer than 3
    admission_diag.loc[~e_mask, 'ICD9_CODE'] = admission_diag.loc[
        ~e_mask, 'ICD9_CODE'].str[:3]

    # use crosstab to convert to binary matrix
    admission_diag = admission_diag[['HADM_ID', 'ICD9_CODE']]
    admission_diag = np.clip(
        pd.crosstab(admission_diag.HADM_ID, admission_diag.ICD9_CODE), 0, 1)
    admission_diag['HADM_ID'] = admission_diag.index

    final_df = dd.merge(vitals_stays, admission_diag, on='HADM_ID')
    return final_df.compute()
def constructDictFromCSVFiles(csv1, csv2, csv1_key, csv2_key,
                              object_name, attribute_name, type_join):
    if csv1 != csv2 or type_join.lower() != 'none':
        print('2 different csvs')
        table1 = dd.read_csv(csv1, header=0, usecols=[csv1_key, object_name],
                             low_memory=False)
        table2 = dd.read_csv(csv2, header=0, usecols=[csv2_key, attribute_name],
                             low_memory=False)
        merged_table = dd.merge(table1, table2, left_on=csv1_key,
                                right_on=csv2_key, how=type_join)
    else:
        print('no joins')
        merged_table = dd.read_csv(csv1, header=0,
                                   usecols=[csv1_key, object_name, attribute_name],
                                   low_memory=False)

    dico = collections.defaultdict(list)
    for row in merged_table.itertuples():
        key = getattr(row, object_name)
        att = getattr(row, attribute_name)
        dico[key].append(att)
    return dico
def test_merge_maintains_columns():
    lhs = pd.DataFrame({"A": [1, 2, 3],
                        "B": list("abc"),
                        "C": "foo",
                        "D": 1.0},
                       columns=list("DCBA"))
    rhs = pd.DataFrame({"G": [4, 5],
                        "H": 6.0,
                        "I": "bar",
                        "B": list("ab")},
                       columns=list("GHIB"))
    ddf = dd.from_pandas(lhs, npartitions=1)
    merged = dd.merge(ddf, rhs, on="B").compute()
    assert tuple(merged.columns) == ("D", "C", "B", "A", "G", "H", "I")
def test_merge_maintains_columns(lhs, rhs):
    ddf = dd.from_pandas(lhs, npartitions=1)
    merged = dd.merge(ddf, rhs, on='B').compute()
    assert tuple(merged.columns) == ('D', 'C', 'B', 'A', 'G', 'H', 'I')
import glob
import os

import numpy as np
import pandas as pd
import dask.dataframe as dd

repeats = dd.read_csv("repeats_hg19.csv")
anno = dd.read_table("RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.CATGAC.dan.anno")

# join annotations to repeats by chromosome
df1 = dd.merge(anno, repeats, on="chr", how="outer", suffixes=("", "_repeat"))
df1.to_csv("find_repeatsTESToutput.csv", index=False)

# keep only rows whose annotation start falls inside the repeat interval
df1 = df1[(df1.start >= df1.begin) & (df1.start <= df1.end)]
df1 = dd.merge(anno, df1, on=["chr"])
df1.to_csv("find_repeatsTEST2.csv", index=False)
def test_merge_by_index_patterns(how):
    pdf1l = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                          'b': [7, 6, 5, 4, 3, 2, 1]})
    pdf1r = pd.DataFrame({'c': [1, 2, 3, 4, 5, 6, 7],
                          'd': [7, 6, 5, 4, 3, 2, 1]})

    pdf2l = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                          'b': [7, 6, 5, 4, 3, 2, 1]},
                         index=list('abcdefg'))
    pdf2r = pd.DataFrame({'c': [7, 6, 5, 4, 3, 2, 1],
                          'd': [7, 6, 5, 4, 3, 2, 1]},
                         index=list('abcdefg'))

    pdf3l = pdf2l
    pdf3r = pd.DataFrame({'c': [6, 7, 8, 9], 'd': [5, 4, 3, 2]},
                         index=list('abdg'))

    pdf4l = pdf2l
    pdf4r = pd.DataFrame({'c': [9, 10, 11, 12], 'd': [5, 4, 3, 2]},
                         index=list('abdg'))

    # completely different index
    pdf5l = pd.DataFrame({'a': [1, 1, 2, 2, 3, 3, 4],
                          'b': [7, 6, 5, 4, 3, 2, 1]},
                         index=list('lmnopqr'))
    pdf5r = pd.DataFrame({'c': [1, 1, 1, 1], 'd': [5, 4, 3, 2]},
                         index=list('abcd'))

    pdf6l = pd.DataFrame({'a': [1, 1, 2, 2, 3, 3, 4],
                          'b': [7, 6, 5, 4, 3, 2, 1]},
                         index=list('cdefghi'))
    pdf6r = pd.DataFrame({'c': [1, 2, 1, 2], 'd': [5, 4, 3, 2]},
                         index=list('abcd'))

    pdf7l = pd.DataFrame({'a': [1, 1, 2, 2, 3, 3, 4],
                          'b': [7, 6, 5, 4, 3, 2, 1]},
                         index=list('abcdefg'))
    pdf7r = pd.DataFrame({'c': [5, 6, 7, 8], 'd': [5, 4, 3, 2]},
                         index=list('fghi'))

    for pdl, pdr in [(pdf1l, pdf1r), (pdf2l, pdf2r), (pdf3l, pdf3r),
                     (pdf4l, pdf4r), (pdf5l, pdf5r), (pdf6l, pdf6r),
                     (pdf7l, pdf7r)]:

        for lpart, rpart in [(2, 2),   # same partition
                             (3, 2),   # left npartition > right npartition
                             (2, 3)]:  # left npartition < right npartition

            ddl = dd.from_pandas(pdl, lpart)
            ddr = dd.from_pandas(pdr, rpart)

            eq(dd.merge(ddl, ddr, how=how, left_index=True, right_index=True),
               pd.merge(pdl, pdr, how=how, left_index=True, right_index=True))
            eq(dd.merge(ddr, ddl, how=how, left_index=True, right_index=True),
               pd.merge(pdr, pdl, how=how, left_index=True, right_index=True))

            eq(ddr.merge(ddl, how=how, left_index=True, right_index=True),
               pdr.merge(pdl, how=how, left_index=True, right_index=True))
            eq(ddl.merge(ddr, how=how, left_index=True, right_index=True),
               pdl.merge(pdr, how=how, left_index=True, right_index=True))

            # hash join
            list_eq(dd.merge(ddl, ddr, how=how, left_on='a', right_on='c'),
                    pd.merge(pdl, pdr, how=how, left_on='a', right_on='c'))
            list_eq(dd.merge(ddl, ddr, how=how, left_on='b', right_on='d'),
                    pd.merge(pdl, pdr, how=how, left_on='b', right_on='d'))

            list_eq(dd.merge(ddr, ddl, how=how, left_on='c', right_on='a'),
                    pd.merge(pdr, pdl, how=how, left_on='c', right_on='a'))
            list_eq(dd.merge(ddr, ddl, how=how, left_on='d', right_on='b'),
                    pd.merge(pdr, pdl, how=how, left_on='d', right_on='b'))

            list_eq(ddl.merge(ddr, how=how, left_on='a', right_on='c'),
                    pdl.merge(pdr, how=how, left_on='a', right_on='c'))
            list_eq(ddl.merge(ddr, how=how, left_on='b', right_on='d'),
                    pdl.merge(pdr, how=how, left_on='b', right_on='d'))

            list_eq(ddr.merge(ddl, how=how, left_on='c', right_on='a'),
                    pdr.merge(pdl, how=how, left_on='c', right_on='a'))
            list_eq(ddr.merge(ddl, how=how, left_on='d', right_on='b'),
                    pdr.merge(pdl, how=how, left_on='d', right_on='b'))
def test_merge(how):
    A = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': [1, 1, 2, 2, 3, 4]})
    a = dd.repartition(A, [0, 4, 5])

    B = pd.DataFrame({'y': [1, 3, 4, 4, 5, 6], 'z': [6, 5, 4, 3, 2, 1]})
    b = dd.repartition(B, [0, 2, 5])

    eq(dd.merge(a, b, left_index=True, right_index=True),
       pd.merge(A, B, left_index=True, right_index=True))

    result = dd.merge(a, b, on='y', how=how)
    list_eq(result, pd.merge(A, B, on='y', how=how))
    assert all(d is None for d in result.divisions)

    list_eq(dd.merge(a, b, left_on='x', right_on='z', how=how),
            pd.merge(A, B, left_on='x', right_on='z', how=how))
    list_eq(dd.merge(a, b, left_on='x', right_on='z', how=how,
                     suffixes=('1', '2')),
            pd.merge(A, B, left_on='x', right_on='z', how=how,
                     suffixes=('1', '2')))

    list_eq(dd.merge(a, b, how=how), pd.merge(A, B, how=how))
    list_eq(dd.merge(a, B, how=how), pd.merge(A, B, how=how))
    list_eq(dd.merge(A, b, how=how), pd.merge(A, B, how=how))
    list_eq(dd.merge(A, B, how=how), pd.merge(A, B, how=how))

    list_eq(dd.merge(a, b, left_index=True, right_index=True, how=how),
            pd.merge(A, B, left_index=True, right_index=True, how=how))
    list_eq(dd.merge(a, b, left_index=True, right_index=True, how=how,
                     suffixes=('1', '2')),
            pd.merge(A, B, left_index=True, right_index=True, how=how,
                     suffixes=('1', '2')))

    list_eq(dd.merge(a, b, left_on='x', right_index=True, how=how),
            pd.merge(A, B, left_on='x', right_index=True, how=how))
    list_eq(dd.merge(a, b, left_on='x', right_index=True, how=how,
                     suffixes=('1', '2')),
            pd.merge(A, B, left_on='x', right_index=True, how=how,
                     suffixes=('1', '2')))
def test_merge_by_multiple_columns(how, shuffle):
    pdf1l = pd.DataFrame(
        {"a": list("abcdefghij"),
         "b": list("abcdefghij"),
         "c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
        index=list("abcdefghij"),
    )
    pdf1r = pd.DataFrame(
        {"d": list("abcdefghij"),
         "e": list("abcdefghij"),
         "f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]},
        index=list("abcdefghij"),
    )

    pdf2l = pd.DataFrame(
        {"a": list("abcdeabcde"),
         "b": list("abcabcabca"),
         "c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
        index=list("abcdefghij"),
    )
    pdf2r = pd.DataFrame(
        {"d": list("edcbaedcba"),
         "e": list("aaabbbcccd"),
         "f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]},
        index=list("fghijklmno"),
    )

    pdf3l = pd.DataFrame(
        {"a": list("aaaaaaaaaa"),
         "b": list("aaaaaaaaaa"),
         "c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
        index=list("abcdefghij"),
    )
    pdf3r = pd.DataFrame(
        {"d": list("aaabbbccaa"),
         "e": list("abbbbbbbbb"),
         "f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]},
        index=list("ABCDEFGHIJ"),
    )

    for pdl, pdr in [(pdf1l, pdf1r), (pdf2l, pdf2r), (pdf3l, pdf3r)]:
        for lpart, rpart in [(2, 2), (3, 2), (2, 3)]:
            ddl = dd.from_pandas(pdl, lpart)
            ddr = dd.from_pandas(pdr, rpart)

            eq(ddl.join(ddr, how=how, shuffle=shuffle), pdl.join(pdr, how=how))
            eq(ddr.join(ddl, how=how, shuffle=shuffle), pdr.join(pdl, how=how))

            eq(
                dd.merge(ddl, ddr, how=how, left_index=True, right_index=True,
                         shuffle=shuffle),
                pd.merge(pdl, pdr, how=how, left_index=True, right_index=True),
            )
            eq(
                dd.merge(ddr, ddl, how=how, left_index=True, right_index=True,
                         shuffle=shuffle),
                pd.merge(pdr, pdl, how=how, left_index=True, right_index=True),
            )

            # hash join
            list_eq(
                dd.merge(ddl, ddr, how=how, left_on="a", right_on="d",
                         shuffle=shuffle),
                pd.merge(pdl, pdr, how=how, left_on="a", right_on="d"),
            )
            list_eq(
                dd.merge(ddl, ddr, how=how, left_on="b", right_on="e",
                         shuffle=shuffle),
                pd.merge(pdl, pdr, how=how, left_on="b", right_on="e"),
            )

            list_eq(
                dd.merge(ddr, ddl, how=how, left_on="d", right_on="a",
                         shuffle=shuffle),
                pd.merge(pdr, pdl, how=how, left_on="d", right_on="a"),
            )
            list_eq(
                dd.merge(ddr, ddl, how=how, left_on="e", right_on="b",
                         shuffle=shuffle),
                pd.merge(pdr, pdl, how=how, left_on="e", right_on="b"),
            )

            list_eq(
                dd.merge(ddl, ddr, how=how, left_on=["a", "b"],
                         right_on=["d", "e"], shuffle=shuffle),
                pd.merge(pdl, pdr, how=how, left_on=["a", "b"],
                         right_on=["d", "e"]),
            )
def test_merge_maintains_columns(lhs, rhs):
    ddf = dd.from_pandas(lhs, npartitions=1)
    merged = dd.merge(ddf, rhs, on="B").compute()
    assert tuple(merged.columns) == ("D", "C", "B", "A", "G", "H", "I")
def test_merge_by_index_patterns(how, shuffle):
    pdf1l = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7],
                          "b": [7, 6, 5, 4, 3, 2, 1]})
    pdf1r = pd.DataFrame({"c": [1, 2, 3, 4, 5, 6, 7],
                          "d": [7, 6, 5, 4, 3, 2, 1]})

    pdf2l = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7],
                          "b": [7, 6, 5, 4, 3, 2, 1]},
                         index=list("abcdefg"))
    pdf2r = pd.DataFrame({"c": [7, 6, 5, 4, 3, 2, 1],
                          "d": [7, 6, 5, 4, 3, 2, 1]},
                         index=list("abcdefg"))

    pdf3l = pdf2l
    pdf3r = pd.DataFrame({"c": [6, 7, 8, 9], "d": [5, 4, 3, 2]},
                         index=list("abdg"))

    pdf4l = pdf2l
    pdf4r = pd.DataFrame({"c": [9, 10, 11, 12], "d": [5, 4, 3, 2]},
                         index=list("abdg"))

    # completely different index
    pdf5l = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3, 4],
                          "b": [7, 6, 5, 4, 3, 2, 1]},
                         index=list("lmnopqr"))
    pdf5r = pd.DataFrame({"c": [1, 1, 1, 1], "d": [5, 4, 3, 2]},
                         index=list("abcd"))

    pdf6l = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3, 4],
                          "b": [7, 6, 5, 4, 3, 2, 1]},
                         index=list("cdefghi"))
    pdf6r = pd.DataFrame({"c": [1, 2, 1, 2], "d": [5, 4, 3, 2]},
                         index=list("abcd"))

    pdf7l = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3, 4],
                          "b": [7, 6, 5, 4, 3, 2, 1]},
                         index=list("abcdefg"))
    pdf7r = pd.DataFrame({"c": [5, 6, 7, 8], "d": [5, 4, 3, 2]},
                         index=list("fghi"))

    for pdl, pdr in [
        (pdf1l, pdf1r),
        (pdf2l, pdf2r),
        (pdf3l, pdf3r),
        (pdf4l, pdf4r),
        (pdf5l, pdf5r),
        (pdf6l, pdf6r),
        (pdf7l, pdf7r),
    ]:
        for lpart, rpart in [
            (2, 2),  # same partition
            (3, 2),  # left npartition > right npartition
            (2, 3),  # left npartition < right npartition
        ]:
            ddl = dd.from_pandas(pdl, lpart)
            ddr = dd.from_pandas(pdr, rpart)

            eq(
                dd.merge(ddl, ddr, how=how, left_index=True, right_index=True,
                         shuffle=shuffle),
                pd.merge(pdl, pdr, how=how, left_index=True, right_index=True),
            )
            eq(
                dd.merge(ddr, ddl, how=how, left_index=True, right_index=True,
                         shuffle=shuffle),
                pd.merge(pdr, pdl, how=how, left_index=True, right_index=True),
            )

            eq(
                ddr.merge(ddl, how=how, left_index=True, right_index=True,
                          shuffle=shuffle),
                pdr.merge(pdl, how=how, left_index=True, right_index=True),
            )
            eq(
                ddl.merge(ddr, how=how, left_index=True, right_index=True,
                          shuffle=shuffle),
                pdl.merge(pdr, how=how, left_index=True, right_index=True),
            )

            # hash join
            list_eq(
                dd.merge(ddl, ddr, how=how, left_on="a", right_on="c",
                         shuffle=shuffle),
                pd.merge(pdl, pdr, how=how, left_on="a", right_on="c"),
            )
            list_eq(
                dd.merge(ddl, ddr, how=how, left_on="b", right_on="d",
                         shuffle=shuffle),
                pd.merge(pdl, pdr, how=how, left_on="b", right_on="d"),
            )

            list_eq(
                dd.merge(ddr, ddl, how=how, left_on="c", right_on="a",
                         shuffle=shuffle),
                pd.merge(pdr, pdl, how=how, left_on="c", right_on="a"),
            )
            list_eq(
                dd.merge(ddr, ddl, how=how, left_on="d", right_on="b",
                         shuffle=shuffle),
                pd.merge(pdr, pdl, how=how, left_on="d", right_on="b"),
            )

            list_eq(
                ddl.merge(ddr, how=how, left_on="a", right_on="c",
                          shuffle=shuffle),
                pdl.merge(pdr, how=how, left_on="a", right_on="c"),
            )
            list_eq(
                ddl.merge(ddr, how=how, left_on="b", right_on="d",
                          shuffle=shuffle),
                pdl.merge(pdr, how=how, left_on="b", right_on="d"),
            )

            list_eq(
                ddr.merge(ddl, how=how, left_on="c", right_on="a",
                          shuffle=shuffle),
                pdr.merge(pdl, how=how, left_on="c", right_on="a"),
            )
            list_eq(
                ddr.merge(ddl, how=how, left_on="d", right_on="b",
                          shuffle=shuffle),
                pdr.merge(pdl, how=how, left_on="d", right_on="b"),
            )
def test_merge(how, shuffle):
    A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
    a = dd.repartition(A, [0, 4, 5])

    B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
    b = dd.repartition(B, [0, 2, 5])

    eq(
        dd.merge(a, b, left_index=True, right_index=True, shuffle=shuffle),
        pd.merge(A, B, left_index=True, right_index=True),
    )

    result = dd.merge(a, b, on="y", how=how)
    list_eq(result, pd.merge(A, B, on="y", how=how))
    assert all(d is None for d in result.divisions)

    list_eq(
        dd.merge(a, b, left_on="x", right_on="z", how=how, shuffle=shuffle),
        pd.merge(A, B, left_on="x", right_on="z", how=how),
    )
    list_eq(
        dd.merge(a, b, left_on="x", right_on="z", how=how,
                 suffixes=("1", "2"), shuffle=shuffle),
        pd.merge(A, B, left_on="x", right_on="z", how=how,
                 suffixes=("1", "2")),
    )

    list_eq(dd.merge(a, b, how=how, shuffle=shuffle), pd.merge(A, B, how=how))
    list_eq(dd.merge(a, B, how=how, shuffle=shuffle), pd.merge(A, B, how=how))
    list_eq(dd.merge(A, b, how=how, shuffle=shuffle), pd.merge(A, B, how=how))
    list_eq(dd.merge(A, B, how=how, shuffle=shuffle), pd.merge(A, B, how=how))

    list_eq(
        dd.merge(a, b, left_index=True, right_index=True, how=how,
                 shuffle=shuffle),
        pd.merge(A, B, left_index=True, right_index=True, how=how),
    )
    list_eq(
        dd.merge(a, b, left_index=True, right_index=True, how=how,
                 suffixes=("1", "2"), shuffle=shuffle),
        pd.merge(A, B, left_index=True, right_index=True, how=how,
                 suffixes=("1", "2")),
    )

    list_eq(
        dd.merge(a, b, left_on="x", right_index=True, how=how, shuffle=shuffle),
        pd.merge(A, B, left_on="x", right_index=True, how=how),
    )
    list_eq(
        dd.merge(a, b, left_on="x", right_index=True, how=how,
                 suffixes=("1", "2"), shuffle=shuffle),
        pd.merge(A, B, left_on="x", right_index=True, how=how,
                 suffixes=("1", "2")),
    )
def test_merge_by_index_patterns():
    pdf1l = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                          'b': [7, 6, 5, 4, 3, 2, 1]})
    pdf1r = pd.DataFrame({'c': [1, 2, 3, 4, 5, 6, 7],
                          'd': [7, 6, 5, 4, 3, 2, 1]})

    pdf2l = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                          'b': [7, 6, 5, 4, 3, 2, 1]},
                         index=list('abcdefg'))
    pdf2r = pd.DataFrame({'c': [1, 2, 3, 4, 5, 6, 7],
                          'd': [7, 6, 5, 4, 3, 2, 1]},
                         index=list('abcdefg'))

    pdf3l = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                          'b': [7, 6, 5, 4, 3, 2, 1]},
                         index=list('abcdefg'))
    pdf3r = pd.DataFrame({'c': [1, 2, 3, 4], 'd': [5, 4, 3, 2]},
                         index=list('abdg'))

    pdf4r = pd.DataFrame({'c': [1, 2, 3, 4], 'd': [5, 4, 3, 2]},
                         index=list('abdg'))
    pdf4l = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                          'b': [7, 6, 5, 4, 3, 2, 1]},
                         index=list('abcdefg'))

    # completely different index
    pdf5r = pd.DataFrame({'c': [1, 2, 3, 4], 'd': [5, 4, 3, 2]},
                         index=list('abcd'))
    pdf5l = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                          'b': [7, 6, 5, 4, 3, 2, 1]},
                         index=list('lmnopqr'))

    pdf6r = pd.DataFrame({'c': [1, 2, 3, 4], 'd': [5, 4, 3, 2]},
                         index=list('abcd'))
    pdf6l = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                          'b': [7, 6, 5, 4, 3, 2, 1]},
                         index=list('cdefghi'))

    pdf7r = pd.DataFrame({'c': [1, 2, 3, 4], 'd': [5, 4, 3, 2]},
                         index=list('fghi'))
    pdf7l = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                          'b': [7, 6, 5, 4, 3, 2, 1]},
                         index=list('abcdefg'))

    for pdl, pdr in [(pdf1l, pdf1r), (pdf2l, pdf2r), (pdf3l, pdf3r),
                     (pdf4l, pdf4r), (pdf5r, pdf5l), (pdf6r, pdf6l),
                     (pdf7r, pdf7l)]:

        # same partition
        ddl = dd.from_pandas(pdl, 2)
        ddr = dd.from_pandas(pdr, 2)

        for how in ['inner', 'outer', 'left', 'right']:
            eq(dd.merge(ddl, ddr, how=how, left_index=True, right_index=True),
               pd.merge(pdl, pdr, how=how, left_index=True, right_index=True))

        # different partition (left npartition > right npartition)
        ddl = dd.from_pandas(pdl, 3)
        ddr = dd.from_pandas(pdr, 2)

        for how in ['inner', 'outer', 'left', 'right']:
            eq(dd.merge(ddl, ddr, how=how, left_index=True, right_index=True),
               pd.merge(pdl, pdr, how=how, left_index=True, right_index=True))

        # different partition (left npartition < right npartition)
        ddl = dd.from_pandas(pdl, 2)
        ddr = dd.from_pandas(pdr, 3)

        for how in ['inner', 'outer', 'left', 'right']:
            eq(dd.merge(ddl, ddr, how=how, left_index=True, right_index=True),
               pd.merge(pdl, pdr, how=how, left_index=True, right_index=True))