def test_concat(join):
    pdf1 = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7],
                         'y': list('abcdef')},
                        index=[1, 2, 3, 4, 6, 7])
    ddf1 = dd.from_pandas(pdf1, 2)
    pdf2 = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7],
                         'y': list('abcdef')},
                        index=[8, 9, 10, 11, 12, 13])
    ddf2 = dd.from_pandas(pdf2, 2)

    # different columns
    pdf3 = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7],
                         'z': list('abcdef')},
                        index=[8, 9, 10, 11, 12, 13])
    ddf3 = dd.from_pandas(pdf3, 2)

    for (dd1, dd2, pd1, pd2) in [(ddf1, ddf2, pdf1, pdf2),
                                 (ddf1, ddf3, pdf1, pdf3)]:
        result = dd.concat([dd1, dd2], join=join)
        expected = pd.concat([pd1, pd2], join=join)
        assert eq(result, expected)

    # test outer only, inner has a problem on pandas side
    for (dd1, dd2, pd1, pd2) in [(ddf1, ddf2, pdf1, pdf2),
                                 (ddf1, ddf3, pdf1, pdf3),
                                 (ddf1.x, ddf2.x, pdf1.x, pdf2.x),
                                 (ddf1.x, ddf3.z, pdf1.x, pdf3.z),
                                 (ddf1.x, ddf2.x, pdf1.x, pdf2.x),
                                 (ddf1.x, ddf3.z, pdf1.x, pdf3.z)]:
        result = dd.concat([dd1, dd2])
        expected = pd.concat([pd1, pd2])
        assert eq(result, expected)

def test_concat3():
    pdf1 = pd.DataFrame(np.random.randn(6, 5),
                        columns=list('ABCDE'), index=list('abcdef'))
    pdf2 = pd.DataFrame(np.random.randn(6, 5),
                        columns=list('ABCFG'), index=list('ghijkl'))
    pdf3 = pd.DataFrame(np.random.randn(6, 5),
                        columns=list('ABCHI'), index=list('mnopqr'))
    ddf1 = dd.from_pandas(pdf1, 2)
    ddf2 = dd.from_pandas(pdf2, 3)
    ddf3 = dd.from_pandas(pdf3, 2)

    result = dd.concat([ddf1, ddf2])
    assert result.divisions == ddf1.divisions[:-1] + ddf2.divisions
    assert result.npartitions == ddf1.npartitions + ddf2.npartitions
    assert_eq(result, pd.concat([pdf1, pdf2]))

    assert_eq(dd.concat([ddf1, ddf2], interleave_partitions=True),
              pd.concat([pdf1, pdf2]))

    result = dd.concat([ddf1, ddf2, ddf3])
    assert result.divisions == (ddf1.divisions[:-1] + ddf2.divisions[:-1] +
                                ddf3.divisions)
    assert result.npartitions == (ddf1.npartitions + ddf2.npartitions +
                                  ddf3.npartitions)
    assert_eq(result, pd.concat([pdf1, pdf2, pdf3]))

    assert_eq(dd.concat([ddf1, ddf2, ddf3], interleave_partitions=True),
              pd.concat([pdf1, pdf2, pdf3]))

def test_concat_unknown_divisions_errors():
    a = pd.Series([1, 2, 3, 4, 5, 6])
    b = pd.Series([4, 3, 2, 1])
    aa = dd.from_pandas(a, npartitions=2, sort=False)
    bb = dd.from_pandas(b, npartitions=2, sort=False)

    with pytest.raises(ValueError):
        dd.concat([aa, bb], axis=1).compute()

def test_concat_one_series():
    a = pd.Series([1, 2, 3, 4])
    aa = dd.from_pandas(a, npartitions=2, sort=False)

    c = dd.concat([aa], axis=0)
    assert isinstance(c, dd.Series)

    c = dd.concat([aa], axis=1)
    assert isinstance(c, dd.DataFrame)

def test_concat_unknown_divisions():
    a = pd.Series([1, 2, 3, 4])
    b = pd.Series([4, 3, 2, 1])
    aa = dd.from_pandas(a, npartitions=2, sort=False)
    bb = dd.from_pandas(b, npartitions=2, sort=False)

    assert not aa.known_divisions

    assert eq(pd.concat([a, b], axis=1),
              dd.concat([aa, bb], axis=1))

    cc = dd.from_pandas(b, npartitions=1, sort=False)
    with pytest.raises(ValueError):
        dd.concat([aa, cc], axis=1)

def test_concat5():
    pdf1 = pd.DataFrame(np.random.randn(7, 5),
                        columns=list('ABCDE'), index=list('abcdefg'))
    pdf2 = pd.DataFrame(np.random.randn(7, 6),
                        columns=list('FGHIJK'), index=list('abcdefg'))
    pdf3 = pd.DataFrame(np.random.randn(7, 6),
                        columns=list('FGHIJK'), index=list('cdefghi'))
    pdf4 = pd.DataFrame(np.random.randn(7, 5),
                        columns=list('FGHAB'), index=list('cdefghi'))
    pdf5 = pd.DataFrame(np.random.randn(7, 5),
                        columns=list('FGHAB'), index=list('fklmnop'))

    ddf1 = dd.from_pandas(pdf1, 2)
    ddf2 = dd.from_pandas(pdf2, 3)
    ddf3 = dd.from_pandas(pdf3, 2)
    ddf4 = dd.from_pandas(pdf4, 2)
    ddf5 = dd.from_pandas(pdf5, 3)

    cases = [[ddf1, ddf2], [ddf1, ddf3], [ddf1, ddf4], [ddf1, ddf5],
             [ddf3, ddf4], [ddf3, ddf5], [ddf5, ddf1, ddf4], [ddf5, ddf3],
             [ddf1.A, ddf4.A], [ddf2.F, ddf3.F], [ddf4.A, ddf5.A],
             [ddf1.A, ddf4.F], [ddf2.F, ddf3.H], [ddf4.A, ddf5.B],
             [ddf1, ddf4.A], [ddf3.F, ddf2], [ddf5, ddf1.A, ddf2]]

    for case in cases:
        pdcase = [c.compute() for c in case]

        with pytest.warns(None):
            # some cases will raise warning directly from pandas
            assert_eq(dd.concat(case, interleave_partitions=True),
                      pd.concat(pdcase))

        assert_eq(dd.concat(case, join='inner', interleave_partitions=True),
                  pd.concat(pdcase, join='inner'))

        assert_eq(dd.concat(case, axis=1), pd.concat(pdcase, axis=1))

        assert_eq(dd.concat(case, axis=1, join='inner'),
                  pd.concat(pdcase, axis=1, join='inner'))

    # Dask + pandas
    cases = [[ddf1, pdf2], [ddf1, pdf3], [pdf1, ddf4],
             [pdf1.A, ddf4.A], [ddf2.F, pdf3.F],
             [ddf1, pdf4.A], [ddf3.F, pdf2], [ddf2, pdf1, ddf3.F]]

    for case in cases:
        pdcase = [c.compute() if isinstance(c, _Frame) else c for c in case]

        assert_eq(dd.concat(case, interleave_partitions=True),
                  pd.concat(pdcase))

        assert_eq(dd.concat(case, join='inner', interleave_partitions=True),
                  pd.concat(pdcase, join='inner'))

        assert_eq(dd.concat(case, axis=1), pd.concat(pdcase, axis=1))

        assert_eq(dd.concat(case, axis=1, join='inner'),
                  pd.concat(pdcase, axis=1, join='inner'))

def test_union_with_list_types(t, df, distinct):
    expr = t.union(t, distinct=distinct)
    result = expr.compile()
    expected = (
        df if distinct else dd.concat([df, df], axis=0, ignore_index=True)
    )
    tm.assert_frame_equal(result.compute(), expected.compute())

def predict_price(total_amount, trip_distance, passenger_count):
    # Create a dataframe out of the three columns
    # and pass it to dask-xgboost, to predict
    # distributed
    X = dd.concat([total_amount, trip_distance, passenger_count],
                  axis=1).astype("float64")
    return dask_xgboost.predict(client, bst, X)

def _merge_xyz(xyz_data_frames: list):
    """Try to merge xyz data frames."""
    try:
        df = dd.concat(xyz_data_frames, axis=0)
        return df
    except Exception as e:
        raise e

def run(self):
    dsk = None
    if ParquetTarget(
        env_workaround().return_env("local_location")
        + "rates/"
        + self.instrument
        + "/"
    ).exists():
        input_target = next(iter(self.input()))
        dsk = input_target.read()
    df = self.fetch()
    # `is not None` rather than `!= None`: comparing a dask dataframe with
    # `!=` produces an elementwise result, which cannot be used in `if`.
    if dsk is not None:
        dsk2 = dd.from_pandas(df, chunksize=10000)
        dsk = dd.concat([dsk, dsk2])
        dsk = dsk.drop_duplicates()
    else:
        dsk = dd.from_pandas(df, chunksize=10000)
    self.output().write(dsk)
    if self.storage == "s3":
        self.s3output().write(dsk)

def to_dc(
    cls,
    input_item: InputType,
    table_name: str,
    format: str = None,
    persist: bool = True,
    **kwargs,
) -> DataContainer:
    """
    Turn possible input descriptions or formats (e.g. dask dataframes,
    pandas dataframes, locations as string, hive tables) into the loaded
    data containers, maybe persist them to cluster memory before.
    """
    filled_get_dask_dataframe = lambda *args: cls._get_dask_dataframe(
        *args,
        table_name=table_name,
        format=format,
        **kwargs,
    )

    if isinstance(input_item, list):
        table = dd.concat([filled_get_dask_dataframe(item) for item in input_item])
    else:
        table = filled_get_dask_dataframe(input_item)

    if persist:
        table = table.persist()

    return DataContainer(table.copy(), ColumnContainer(table.columns))

def _load_table(self, table):
    df = None
    for name, filepath in self._named_files.items():
        filepath, ext = filepath
        if '://' not in filepath:
            filepath = os.path.join(self.root, filepath)
        if name != table:
            continue
        load_fn, kwargs = self._load_fn(ext)
        paths = self._resolve_template_vars(filepath)
        if self.use_dask and ext in ('csv', 'json', 'parquet', 'parq'):
            df = load_fn(paths, **kwargs)
        else:
            dfs = [load_fn(path, **kwargs) for path in paths]
            if len(dfs) <= 1:
                df = dfs[0] if dfs else None
            elif self.use_dask and hasattr(dfs[0], 'compute'):
                import dask.dataframe as dd
                df = dd.concat(dfs)
            else:
                df = pd.concat(dfs)
        if hasattr(df, 'persist'):
            df = df.persist()
    if df is None:
        tables = list(self._named_files)
        raise ValueError(
            f"Table '{table}' not found. Available tables include: {tables}."
        )
    return df

def safe_concat(dfs: List[Union[dd.Series, dd.DataFrame]]) -> dd.DataFrame:
    """
    Concat a list of `dd.Series` or `dd.DataFrame` objects into one DataFrame.

    This will use `DataFrame.concat` if all pieces are the same length.
    Otherwise we will iteratively join.

    When axis=1 and divisions are unknown, Dask `DataFrame.concat` can only
    operate on objects with equal lengths, otherwise it will raise a
    ValueError in `concat_and_check`.

    See https://github.com/dask/dask/blob/2c2e837674895cafdb0612be81250ef2657d947e/dask/dataframe/multi.py#L907 # noqa

    Note - this is likely to be quite slow, but this should be hit rarely in
    real usage. A situation that triggers this slow path is aggregations that
    return different numbers of rows (see `test_aggregation_group_by` for a
    specific example).
    TODO - performance.
    """
    lengths = list(map(len, dfs))
    if len(set(lengths)) != 1:
        result = dfs[0].to_frame()
        for other in dfs[1:]:
            result = result.join(other.to_frame(), how="outer")
    else:
        result = dd.concat(dfs, axis=1)
    return result

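# A minimal usage sketch of `safe_concat` above (assuming only pandas and
# dask.dataframe are available, and that `safe_concat` is in scope): the
# unequal-length pair takes the slower outer-join path, while the
# equal-length pair goes through a plain dd.concat(axis=1).
import pandas as pd
import dask.dataframe as dd

s_a = dd.from_pandas(pd.Series([1, 2, 3], name="a"), npartitions=1)
s_b = dd.from_pandas(pd.Series([4.0, 5.0], name="b"), npartitions=1)
s_c = dd.from_pandas(pd.Series([6, 7, 8], name="c"), npartitions=1)

wide = safe_concat([s_a, s_b])   # lengths 3 and 2 -> iterative outer join
same = safe_concat([s_a, s_c])   # equal lengths   -> dd.concat(..., axis=1)
print(wide.compute())            # row 2 gets NaN in column "b"
print(same.compute())
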
def from_excel(path: Union[str, List[str]], **params) -> dd.DataFrame:
    """Creates a `dask.dataframe.DataFrame` from one or several excel files.
    Includes a "path column".

    Parameters
    ----------
    path
        Path to files
    params
        Extra arguments passed on to `pandas.read_excel`

    Returns
    -------
    df
        A `dask.dataframe.DataFrame`
    """
    path_list = _get_file_paths(path)

    dds = []
    for path_name in path_list:
        parts = delayed(pd.read_excel)(path_name, **params)
        data_frame = dd.from_delayed(parts).fillna("")
        data_frame[PATH_COLUMN_NAME] = path_name
        dds.append(data_frame)

    return dd.concat(dds)

def cv_cat_mapping(X_mat, cat_feat_cols=cat_feat_cols, orig_colnames=orig_colnames):
    """Map the OHE categorical variables to their counts within CV."""
    X_df = pd.DataFrame(X_mat)  # Make matrix into pd.df
    X_df.columns = orig_colnames

    # Create indices for each OHE cat variable
    cat_feat_1 = cat_feat_cols[0:2]
    cat_feat_2 = cat_feat_cols[2:5]
    cat_feat_3 = cat_feat_cols[5:7]
    cat_feat_4 = cat_feat_cols[7:19]
    cat_feat_col_list = [cat_feat_1, cat_feat_2, cat_feat_3, cat_feat_4]

    cat_mapping_list = []
    # Map each OHE variable to their counts
    for cat_idx in range(len(cat_feat_col_list)):
        # Apply to X_df
        ohe_var = X_df.loc[:, cat_feat_col_list[cat_idx]]
        mapping_fun = mapping_fun_list[cat_idx]
        cat_mapping_list.append(ohe_var.apply(lambda x: mapping_fun(x), axis=1))

    cat_counts_df = pd.DataFrame(cat_mapping_list).T  # Put results into a df
    cat_counts_df.columns = ['type_counts', 'time_counts', 'dose_counts',
                             'new_feature_counts']
    X_df = dd.concat([X_df, cat_counts_df], axis=1)  # Combine with X_df
    X_df = X_df.compute()  # Return Dask to Pandas
    return X_df

def create_12_mon_features(joined_df, **kwargs):
    # ddf.to_parquet(joined_df, 'dask_create_12_mon_features.pq')
    testdfs = []
    n_months = 12
    for y in range(1, n_months + 1):
        tmpdf = joined_df[[
            'loan_id', 'timestamp_year', 'timestamp_month', 'delinquency_12',
            'upb_12'
        ]]
        tmpdf['josh_months'] = (tmpdf['timestamp_year'] * 12 +
                                tmpdf['timestamp_month'])
        tmpdf['josh_mody_n'] = np.floor(
            (tmpdf['josh_months'].astype('float64') - 24000 - y) / 12)
        # aggregate to one row per (loan_id, josh_mody_n); the columns below
        # are derived from this aggregated result
        tmpdf = tmpdf.groupby(['loan_id', 'josh_mody_n']).agg({
            'delinquency_12': 'max',
            'upb_12': 'min'
        }).reset_index()
        tmpdf['delinquency_12'] = (tmpdf['delinquency_12'] > 3).astype('int32')
        tmpdf['delinquency_12'] += (tmpdf['upb_12'] == 0).astype('int32')
        tmpdf['timestamp_year'] = np.floor(
            ((tmpdf['josh_mody_n'] * n_months) + 24000 + (y - 1)) / 12
        ).astype('int16')
        tmpdf['timestamp_month'] = np.int8(y)
        tmpdf = tmpdf.drop(['josh_mody_n'], axis=1)
        testdfs.append(tmpdf)
        del tmpdf
    del joined_df

    return ddf.concat(testdfs)

def create_date_dataframe(dss, fields_shp, doa, class_col_name, class_use_dict):
    # Put intermediate data frames in a list
    field_dataframes = []

    # Iterate geo-dataframe
    for idx, shape in fields_shp.iterrows():
        # Indicate column of field ID
        IDLote = shape['IDLote']
        # Indicate column of phenology class value
        class_value = shape[class_col_name]
        # Indicate the geometry column
        polygon = shape['geometry']
        if not np.isnan(class_value):
            df = get_field_dataset(dss, polygon, class_value, doa, IDLote)
            df['IDLote'] = IDLote
            df['time'] = doa
            try:
                df['tt'] = class_use_dict[IDLote]
            except KeyError:
                df['tt'] = 'nd'
            field_dataframes.append(df)

    # Concatenate dask dataframes for all fields
    data = dd.concat(field_dataframes, axis=0, interleave_partitions=True)
    return data

def from_parquet(path: Union[str, List[str]], **params) -> dd.DataFrame:
    """Creates a `dd.DataFrame` from one or several parquet files.
    Includes a "path column".

    Parameters
    ----------
    path
        Path to files
    **params
        Extra arguments passed on to `pandas.read_parquet`

    Returns
    -------
    df
        A `dd.DataFrame`
    """
    path_list = _get_file_paths(path)

    dds = []
    for path_name in path_list:
        ddf = dd.read_parquet(path_name, engine="pyarrow", **params)
        ddf[PATH_COLUMN_NAME] = path_name
        dds.append(ddf)

    return dd.concat(dds)

def concat(dfs: List[DataframeLike], engine: Engine):
    if engine == Engine.PANDAS:
        return pd.concat(dfs, ignore_index=True, sort=False)

    if engine == Engine.DASK:
        import dask.dataframe
        return dask.dataframe.concat(dfs).reset_index(drop=True)

    if engine == Engine.CUDF:
        import cudf
        try:
            return cudf.concat(dfs, ignore_index=True)
        except TypeError as e:
            logger.warning(
                'Failed to concat, likely due to column type issue, '
                'try converting to a string; columns'
            )
            for df in dfs:
                logger.warning('df types :: %s', df.dtypes)
            raise e

    if engine == Engine.DASK:  # unreachable: the DASK case is handled above
        import dask.dataframe as dd
        return dd.concat(dfs)

    if engine == Engine.DASK_CUDF:
        import dask_cudf
        return dask_cudf.concat(dfs)

    raise NotImplementedError('Unknown engine')

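# A minimal usage sketch of the engine dispatcher above, assuming the Engine
# enum referenced in that function; the pandas path needs no optional
# GPU/distributed backends, so it is the easiest one to exercise locally.
import pandas as pd

frames = [pd.DataFrame({"x": [1, 2]}), pd.DataFrame({"x": [3]})]
combined = concat(frames, Engine.PANDAS)  # rows from both frames, fresh RangeIndex
assert list(combined["x"]) == [1, 2, 3]
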
def from_json(path: Union[str, List[str]], flatten: bool = False, **params) -> dd.DataFrame:
    """Creates a `dd.DataFrame` from one or several json files.
    Includes a "path column".

    Parameters
    ----------
    path
        Path to files
    flatten
        If true, flatten nested data (default false).
    **params
        Extra arguments passed on to `pandas.read_json`

    Returns
    -------
    dataframe
        A `dd.DataFrame`
    """

    def json_engine(*args, **kwargs) -> pd.DataFrame:
        data_frame = pd.read_json(*args, **kwargs)
        return flatten_dataframe(data_frame) if flatten else data_frame

    path_list = _get_file_paths(path)

    dds = []
    for path_name in path_list:
        ddf = dd.read_json(path_name, engine=json_engine, **params)
        ddf[PATH_COLUMN_NAME] = path_name
        dds.append(ddf)

    return dd.concat(dds)

def test_set_index_sorts():
    # https://github.com/dask/dask/issues/2288
    vals = np.array([1348550149000000000, 1348550149000000000,
                     1348558142000000000, 1348558142000000000,
                     1348585928000000000, 1348585928000000000,
                     1348600739000000000, 1348601706000000000,
                     1348600739000000000, 1348601706000000000,
                     1348614789000000000, 1348614789000000000,
                     1348621037000000000, 1348621038000000000,
                     1348621040000000000, 1348621037000000000,
                     1348621038000000000, 1348621040000000000,
                     1348637628000000000, 1348638159000000000,
                     1348638160000000000, 1348638159000000000,
                     1348638160000000000, 1348637628000000000,
                     1348646354000000000, 1348646354000000000,
                     1348659107000000000, 1348657111000000000,
                     1348659107000000000, 1348657111000000000,
                     1348672876000000000, 1348672876000000000,
                     1348682787000000000, 1348681985000000000,
                     1348682787000000000, 1348681985000000000,
                     1348728167000000000, 1348728167000000000,
                     1348730745000000000, 1348730745000000000,
                     1348750198000000000, 1348750198000000000,
                     1348750198000000000, 1348753539000000000,
                     1348753539000000000, 1348753539000000000,
                     1348754449000000000, 1348754449000000000,
                     1348761333000000000, 1348761554000000000,
                     1348761610000000000, 1348761333000000000,
                     1348761554000000000, 1348761610000000000,
                     1348782624000000000, 1348782624000000000,
                     1348782624000000000, 1348782624000000000])

    vals = pd.to_datetime(vals, unit='ns')
    breaks = [10, 36, 58]
    dfs = []

    for i in range(len(breaks)):
        lo = sum(breaks[:i])
        hi = sum(breaks[i:i + 1])

        dfs.append(pd.DataFrame({"timestamp": vals[lo:hi]},
                                index=range(lo, hi)))

    ddf = dd.concat(dfs).clear_divisions()
    assert ddf.set_index("timestamp").index.compute().is_monotonic is True

def test_set_index_empty_partition():
    test_vals = [1, 2, 3]
    converters = [
        int,
        float,
        str,
        lambda x: pd.to_datetime(x, unit='ns'),
    ]

    for conv in converters:
        df = pd.DataFrame([{'x': conv(i), 'y': i} for i in test_vals],
                          columns=['x', 'y'])
        ddf = dd.concat([
            dd.from_pandas(df, npartitions=1),
            dd.from_pandas(df[df.y > df.y.max()], npartitions=1),
        ])

        assert any(ddf.get_partition(p).compute().empty
                   for p in range(ddf.npartitions))
        assert assert_eq(ddf.set_index('x'), df.set_index('x'))

def run(self):
    # The main function. Gets the script, creates graph and saves the result
    dsk = dd.read_parquet(env_workaround().return_env("local_location") +
                          "trading_history/*.parquet")
    self.instruments = dsk["instrument"].drop_duplicates().compute()
    self.instruments = list(self.instruments.values)[1:]
    self.requires()
    a = self.extract(self.instruments.pop(), self.granularity)
    for i in self.instruments:
        b = self.extract(i, self.granularity)
        a = dd.concat([a, b], axis=1)
    fig, ax = plt.subplots(figsize=(12, 7))
    sns_plot = sns.heatmap(
        a.corr(),
        xticklabels=a.columns,
        yticklabels=a.columns,
        annot=True,
        linewidths=0.3,
    )
    fig = sns_plot.get_figure()
    plt.title(
        "Correlation of instruments in the portfolio with granularity {}".format(
            self.granularity))
    name = "correlation" + self.granularity + ".png"
    if not os.path.exists(env_workaround().return_env("local_location") + "images/"):
        os.makedirs(env_workaround().return_env("local_location") + "images/")
    fig.savefig(env_workaround().return_env("local_location") + "images/" + name)
    self.fig = fig

def simulate_state_lines_losses(eventlookup, freq_mean, states, lines, sims):
    '''assembles state line level events based on the year event'''
    logger = logging.getLogger(__name__)
    logger.info('start state lines losses')
    # NOTE: the original called the dask.dataframe module (`dd`) as a
    # constructor here; a pandas DataFrame is assumed to be the intent.
    numberofevents = pd.DataFrame(np.random.poisson(freq_mean, sims),
                                  index=np.arange(1, sims + 1),
                                  columns=['events'])
    catevents = simulate_events(numberofevents, eventlookup, sims)
    simsevents = list(range(len(catevents)))
    # combinedResults = xr.DataArray(np.empty((len(states), len(lines), len(catevents), 4)),
    #                                name="catevents",
    #                                coords=[states['state'], lines['line'], simsevents,
    #                                        ["sim", "eventseq", "eventid", "rand"]],
    #                                dims=['state', 'line', 'eventsim', 'data'])
    logger.info(
        'start to build full array of losses, combining state lines with events'
    )
    sim_events = None
    firstloop = True
    for state in states['state']:
        print(f'start {state}')
        for line in lines['line']:
            # combinedResults.loc[state, line] = catevents.copy()
            print(f'start {line}')
            b = catevents.copy()
            b['state'] = state
            b['line'] = line
            if firstloop:
                sim_events = b
                firstloop = False
            else:
                sim_events = dd.concat([sim_events, b])
    # sim_events = pd.concat(a, ignore_index=True, axis=0)
    logger.info('Completed combined state lines with events')
    # NOTE: the original returned `combinedResults`, which only exists in the
    # commented-out xarray code above; the accumulated events are returned instead.
    return sim_events

def load_dataframe(self, file_resources, npartitions=None):
    """
    Args:
        file_resources:
        npartitions:
    """
    dfs = []
    for filename, content in file_resources.items():
        if ".gtf" in filename:
            df = read_gtf(content, npartitions=npartitions, compression="gzip")
            dfs.append(df)

    if npartitions:
        annotation_df = dd.concat(dfs)
    else:
        annotation_df = pd.concat(dfs)

    if self.remove_version_num:
        annotation_df["gene_id"] = annotation_df["gene_id"].str.replace(
            "[.].*", "", regex=True)
        annotation_df["transcript_id"] = annotation_df[
            "transcript_id"].str.replace("[.].*", "", regex=True)

    return annotation_df

def fastasToDF(fastas, verbose=False, ecodDB=False):
    regex = re.compile('[^a-zA-Z0-9]')
    regexAA = re.compile('[^ARDNCEQGHILKMFPSTWYV]')
    DFdict = {}
    count = 0
    total = []
    DDF = None

    for fasta in fastas:
        if verbose == True:
            print(fasta)
        fastaIter = SeqIO.parse(fasta, "fasta")
        for seq in fastaIter:
            seqstr = regexAA.sub('', str(seq.seq))
            desc = str(seq.description)
            fastastr = '>' + desc + '\n' + seqstr + '\n'
            if desc not in total:
                # check for duplicates within a folder
                total.append(desc)
                DFdict[desc] = {'desc': desc.encode(),
                                'seq': seqstr,
                                'fasta': fastastr}
                if ecodDB == True:
                    labels = ['ECOD uid', 'ECOD domain',
                              'EOCD hierearchy string', 'ECOD pdb_range']
                    for i, ecodb in enumerate(seq.description.split('|')[1:]):
                        DFdict[desc][labels[i]] = ecodb
            count += 1
            if count % 400 == 0:
                df = pd.DataFrame.from_dict(DFdict, orient='index')
                if df is not None and len(df) > 0:
                    if DDF is None:
                        DDF = dd.from_pandas(df, chunksize=200)
                    else:
                        DDF = dd.concat([DDF, dd.from_pandas(df, chunksize=200)],
                                        interleave_partitions=True)
                DFdict = {}
        else:
            # flush whatever is left after the sequence loop finishes
            df = pd.DataFrame.from_dict(DFdict, orient='index')
            if df is not None and len(df) > 0:
                if DDF is None:
                    DDF = dd.from_pandas(df, chunksize=200)
                else:
                    DDF = dd.concat([DDF, dd.from_pandas(df, chunksize=200)],
                                    interleave_partitions=True)
            DFdict = {}
        if verbose == True:
            print(df)
    return DDF

async def f():
    async with Scheduler(protocol=protocol, interface='ib0',
                         dashboard_address=':8789') as s:
        async with Nanny(s.address, protocol=protocol, nthreads=1,
                         memory_limit='32GB',
                         env={'CUDA_VISIBLE_DEVICES': '2'},
                         ) as w:
            async with Nanny(s.address, protocol=protocol, memory_limit='32gb',
                             env={'CUDA_VISIBLE_DEVICES': '3'},
                             nthreads=1) as w2:
                async with Client(s.address, asynchronous=True) as c:
                    with log_errors(pdb=True):
                        # Create a simple random array
                        # n_rows = 50000000
                        # n_keys = 5000000  # working!!!
                        n_rows = 5000000
                        n_keys = 500000
                        # n_rows = 5000000
                        # n_keys = 2500000
                        chunks = n_rows // 100

                        left = dd.concat([
                            da.random.random(
                                n_rows, chunks=chunks).to_dask_dataframe(columns='x'),
                            da.random.randint(
                                0, n_keys, size=n_rows,
                                chunks=chunks).to_dask_dataframe(columns='id'),
                        ], axis=1).persist()

                        right = dd.concat([
                            da.random.random(
                                n_rows, chunks=chunks).to_dask_dataframe(columns='y'),
                            da.random.randint(
                                0, n_keys, size=n_rows,
                                chunks=chunks).to_dask_dataframe(columns='id'),
                        ], axis=1).persist()

                        gright = right.map_partitions(cudf.from_pandas)
                        gleft = left.map_partitions(cudf.from_pandas)

                        # print(format_bytes(await c.compute(left.size) * 8 * 2))
                        # print(format_bytes(await c.compute(right.size) * 8 * 2))

                        res = gleft.merge(gright, on=['id'])
                        res = await res.persist()

                        print("COMPUTING HEAD()")
                        out = await c.compute(res.head(compute=False))
                        # breakpoint()
                        print(out)

def h5_append_dummy_row(
        df: Union[pd.DataFrame, dd.DataFrame],
        freq=None,
        tim: Optional[Sequence[Any]] = None
        ) -> Union[pd.DataFrame, dd.DataFrame]:
    """
    Add row of NaN with index value that will be between one of last data and one of next data start
    :param df: dataframe
    :param freq: frequency to calc index. If logically equal to False, then will be calculated using tim
    :param tim: sequence having in last elements time of 2 last rows
    :return: appended dataframe
    """
    if tim is not None:
        try:
            dindex = pd.Timedelta(
                seconds=0.5 / freq) if freq else np.abs(tim[-1] - tim[-2]) / 2
        except IndexError:
            # only one element => we think they are seldom so use 1s
            dindex = pd.Timedelta(seconds=1)
        ind_new = [tim[-1] + dindex]
    else:
        df_index, itm = multiindex_timeindex(df.index)
        try:
            dindex = pd.Timedelta(
                seconds=0.5 / freq) if freq else np.abs(df_index[-1] - df_index[-2]) / 2
        except (IndexError, NotImplementedError):
            # only one element => we think they are seldom so use 1s,
            # or NotImplemented in Dask
            dindex = pd.Timedelta(seconds=1)
        ind_new = multiindex_replace(df.index[-1:], df_index[-1:] + dindex, itm)

    dict_dummy = {}
    tip0 = None
    same_types = True
    # tries to prevent falling down to object dtype (which is badly handled
    # by pandas.pytables) if possible
    for name, field in df.dtypes.iteritems():
        typ = field.type
        dict_dummy[name] = typ(0) if np.issubdtype(typ, np.integer) else \
            np.NaN if np.issubdtype(typ, np.floating) else ''

        if same_types:
            if typ != tip0:
                if tip0 is None:
                    tip0 = typ
                else:
                    same_types = False

    df_dummy = pd.DataFrame(dict_dummy, columns=df.columns.values, index=ind_new,
                            dtype=tip0 if same_types else None).rename_axis('Time')
    if isinstance(df, dd.DataFrame):
        # buggish dask not always can append
        return dd.concat([df, df_dummy], axis=0, interleave_partitions=True)
    else:
        return df.append(df_dummy)

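# A minimal, self-contained sketch of the pattern used above (variable names
# such as `df_last` are assumptions, not from the original code): append a
# single NaN row whose index sits half a step past the last timestamp, using
# dd.concat with interleave_partitions=True to mix a dask frame and a
# one-row pandas frame.
import numpy as np
import pandas as pd
import dask.dataframe as dd

idx = pd.date_range("2020-01-01", periods=4, freq="1s").rename("Time")
df_last = dd.from_pandas(
    pd.DataFrame({"v": [1.0, 2.0, 3.0, 4.0]}, index=idx), npartitions=1)

dindex = (idx[-1] - idx[-2]) / 2                      # half of the last step
dummy = pd.DataFrame({"v": [np.nan]},
                     index=[idx[-1] + dindex]).rename_axis("Time")

out = dd.concat([df_last, dummy], axis=0, interleave_partitions=True)
print(out.compute())                                  # 5 rows, last one NaN
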
def load_dataframe(self, file_resources, npartitions=None):
    """
    Args:
        file_resources:
        npartitions:
    """
    go_terms = pd.read_table(
        file_resources["rnacentral_rfam_annotations.tsv"],
        low_memory=True, header=None,
        names=["RNAcentral id", "GO terms", "Rfams"])
    go_terms["RNAcentral id"] = go_terms["RNAcentral id"].str.split(
        "_", expand=True, n=2)[0]

    gene_ids = []
    for file in file_resources:
        if "database_mappings" in file:
            if npartitions:
                id_mapping = dd.read_table(
                    file_resources[file], header=None,
                    names=["RNAcentral id", "database", "external id",
                           "species", "RNA type", "gene symbol"])
            else:
                id_mapping = pd.read_table(
                    file_resources[file], low_memory=True, header=None,
                    names=["RNAcentral id", "database", "external id",
                           "species", "RNA type", "gene symbol"])
            gene_ids.append(id_mapping)

    if npartitions:
        gene_ids = dd.concat(gene_ids, join="inner")
    else:
        gene_ids = pd.concat(gene_ids, join="inner")

    gene_ids["species"] = gene_ids["species"].astype("O")
    if self.species is not None:
        gene_ids = gene_ids[gene_ids["species"] == self.species]

    lnc_go_terms = go_terms[go_terms["RNAcentral id"].isin(
        gene_ids["RNAcentral id"])].groupby("RNAcentral id")[
            "GO terms"].apply(lambda x: "|".join(x.unique()))
    lnc_rfams = go_terms[go_terms["RNAcentral id"].isin(
        gene_ids["RNAcentral id"])].groupby("RNAcentral id")[
            "Rfams"].apply(lambda x: "|".join(x.unique()))

    gene_ids["GO terms"] = gene_ids["RNAcentral id"].map(lnc_go_terms)
    gene_ids["Rfams"] = gene_ids["RNAcentral id"].map(lnc_rfams)
    gene_ids = gene_ids[gene_ids["GO terms"].notnull() |
                        gene_ids["Rfams"].notnull()]

    return gene_ids

def channels(self):
    def get_biometa(array):
        df = dd.from_dask_array(array.channels.data)
        df["Array"] = array.name
        return df

    datasets = self.bag.map(get_biometa)
    return datasets.fold(lambda x, y: dd.concat([x, y]))

def test_to_hdf_modes_multiple_nodes():
    pytest.importorskip("tables")
    df = pd.DataFrame(
        {"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]}, index=[1.0, 2.0, 3.0, 4.0]
    )

    # appending a single partition to existing data
    a = dd.from_pandas(df, 1)
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data2")
        a.to_hdf(fn, "/data*", mode="a")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(dd.concat([df, df]), out)

    # overwriting a file with a single partition
    a = dd.from_pandas(df, 1)
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data2")
        a.to_hdf(fn, "/data*", mode="w")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df, out)

    # appending two partitions to existing data
    a = dd.from_pandas(df, 2)
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data2")
        a.to_hdf(fn, "/data*", mode="a")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(dd.concat([df, df]), out)

    # overwriting a file with two partitions
    a = dd.from_pandas(df, 2)
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data2")
        a.to_hdf(fn, "/data*", mode="w")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df, out)

    # overwriting a single partition, keeping other partitions
    a = dd.from_pandas(df, 2)
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data1")
        a.to_hdf(fn, "/data2")
        a.to_hdf(fn, "/data*", mode="a", append=False)
        out = dd.read_hdf(fn, "/data*")
        assert_eq(dd.concat([df, df]), out)

def par_mc_samples(df, n, reps, replace=False, random_state=8675309, chunksize=100):
    import dask
    import dask.dataframe as dd
    from copy import copy
    import pandas as pd

    cols = copy(df.columns)
    cols = cols.insert(0, 'rep')
    mc_samples_to_return = dd.from_pandas(pd.DataFrame().reindex(columns=cols),
                                          chunksize=chunksize)

    if type(df) != dask.dataframe.core.DataFrame:
        dd_df = dd.from_pandas(df, chunksize=chunksize)
    else:
        dd_df = df

    for i in range(0, reps):
        frac = n / dd_df.shape[0].compute()
        selected_samples_for_rep = dd_df.sample(
            frac=frac, replace=replace, random_state=random_state).compute()

        rep_number = []
        for j in range(0, n):
            rep_number.append([copy(i)])
        rep_number = dd.from_pandas(pd.DataFrame(rep_number), chunksize=chunksize)
        rep_number.columns = ['rep_number']

        selected_samples_for_rep.reset_index(drop=True, inplace=True)
        selected_samples_for_rep = dd.from_pandas(selected_samples_for_rep,
                                                  chunksize=chunksize)
        selected_samples_for_rep = dd.concat(
            [copy(rep_number), copy(selected_samples_for_rep)],
            axis=1, ignore_index=True, sort=False)
        selected_samples_for_rep.columns = cols

        mc_samples_to_return = dd.concat(
            [copy(mc_samples_to_return), copy(selected_samples_for_rep)],
            axis=0, sort=False)

        random_state += 1

    mc_samples_to_return = mc_samples_to_return.groupby(by='rep')
    return mc_samples_to_return

def make_data(n_keys, n_rows_l, n_rows_r):
    left = dd.concat([
        da.random.random(n_rows_l).to_dask_dataframe(columns='x'),
        da.random.randint(0, n_keys, size=n_rows_l).to_dask_dataframe(columns='id'),
    ], axis=1)

    right = dd.concat([
        da.random.random(n_rows_r).to_dask_dataframe(columns='y'),
        da.random.randint(0, n_keys, size=n_rows_r).to_dask_dataframe(columns='id'),
    ], axis=1)

    gleft = left.map_partitions(cudf.from_pandas)
    gright = right.map_partitions(cudf.from_pandas)

    return gleft, gright

def join_instruments(self, inputs):
    print('Is dask df', type(inputs[0].instrument_trades) is dd.DataFrame)
    print('Is pandas df', type(inputs[0].instrument_trades) is pd.DataFrame)
    self.results = dd.concat(
        [input.instrument_trades for input in inputs]).reset_index(drop=True)
    print('Len df', len(self.results))
    self.next(self.end)

def calc(self, df_input: dd.DataFrame, df_output: dd.DataFrame,
         feature_set_list: List) -> dd.DataFrame:
    for feature_set in feature_set_list:
        df_output = dd.concat(
            [df_output, self.calc_feature_set(df_input, feature_set)],
            axis=1)
    return df_output

def concat(cls, datasets, dimensions, vdims):
    dataframes = []
    for key, ds in datasets:
        data = ds.data.copy()
        for d, k in zip(dimensions, key):
            data[d.name] = k
        dataframes.append(data)
    return dd.concat(dataframes)

def test_concat2():
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]}),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]})}
    a = dd.DataFrame(dsk, 'x', ['a', 'b'], [None, None])
    dsk = {('y', 0): pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60]}),
           ('y', 1): pd.DataFrame({'a': [40, 50, 60], 'b': [30, 20, 10]}),
           ('y', 2): pd.DataFrame({'a': [70, 80, 90], 'b': [0, 0, 0]})}
    b = dd.DataFrame(dsk, 'y', ['a', 'b'], [None, None])

    c = dd.concat([a, b])
    assert c.npartitions == a.npartitions + b.npartitions

    assert eq(pd.concat([a.compute(), b.compute()]), c)

    assert dd.concat([a, b]).dask == dd.concat([a, b]).dask

def concat(cls, datasets, dimensions, vdims):
    import dask.dataframe as dd
    dataframes = []
    for key, ds in datasets:
        data = ds.data.copy()
        for d, k in zip(dimensions, key):
            data[d.name] = k
        dataframes.append(data)
    return dd.concat(dataframes)

def _merge_two(left: Dict[str, Union[pd.DataFrame, dd.DataFrame]],
               right: Dict[str, Union[pd.DataFrame, dd.DataFrame]],
               index_col: Union[List, str],
               dtype: str,
               deep=False) -> Dict[str, pd.DataFrame]:
    """merge 2 ingredient data."""
    if len(left) == 0:
        return right

    res_data = {}

    # for datapoints we use dask to help performance.
    if dtype == 'datapoints':
        res_data = dict([(k, v) for k, v in left.items()])
        if deep:
            for k, df in right.items():
                if k in left.keys():
                    columns = left[k].columns.values
                    # res_data[k] = left[k].append(df[columns], interleave_partitions=True)
                    res_data[k] = dd.concat([left[k], df[columns]], axis=0,
                                            interleave_partitions=True)
                    res_data[k] = res_data[k].drop_duplicates(subset=index_col,
                                                              keep='last')
                    # res_data[k] = res_data[k].sort_values(by=index_col)
                else:
                    res_data[k] = df
        else:
            for k, df in right.items():
                res_data[k] = df
    # for concepts/entities, we don't need to use dask.
    elif dtype == 'concepts':
        left_df = pd.concat([x for x in left.values()], sort=False)
        right_df = pd.concat([x for x in right.values()], sort=False)
        if deep:
            merged = left_df.append(right_df, sort=False)
            res = merged.groupby(by=index_col).agg(__get_last_item)
            res_data = {'concept': res.reset_index()}
        else:
            res_data = {'concept': right_df.drop_duplicates(subset='concept',
                                                            keep='last')}
        res_data = res_data
    else:  # entities
        if deep:
            for k, df in right.items():
                if k in left.keys():
                    left[k] = left[k].append(df, ignore_index=True, sort=False)
                    left[k] = left[k].groupby(index_col).agg(
                        __get_last_item).reset_index()
                else:
                    left[k] = df
        else:
            for k, df in right.items():
                left[k] = df
        res_data = left

    return res_data

def check_and_return(ddfs, dfs, join):
    sol = concat(dfs, join=join)
    res = dd.concat(ddfs, join=join, interleave_partitions=divisions)
    assert_eq(res, sol)
    if known:
        parts = compute_as_if_collection(dd.DataFrame, res.dask,
                                         res.__dask_keys__())
        for p in [i.iloc[:0] for i in parts]:
            res._meta == p  # will error if schemas don't align
    assert not cat_index or has_known_categories(res.index) == known
    return res

def test_concat2():
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]}),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]})}
    meta = make_meta({'a': 'i8', 'b': 'i8'})
    a = dd.DataFrame(dsk, 'x', meta, [None, None])
    dsk = {('y', 0): pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60]}),
           ('y', 1): pd.DataFrame({'a': [40, 50, 60], 'b': [30, 20, 10]}),
           ('y', 2): pd.DataFrame({'a': [70, 80, 90], 'b': [0, 0, 0]})}
    b = dd.DataFrame(dsk, 'y', meta, [None, None])

    dsk = {('y', 0): pd.DataFrame({'b': [10, 20, 30], 'c': [40, 50, 60]}),
           ('y', 1): pd.DataFrame({'b': [40, 50, 60], 'c': [30, 20, 10]})}
    meta = make_meta({'b': 'i8', 'c': 'i8'})
    c = dd.DataFrame(dsk, 'y', meta, [None, None])

    dsk = {('y', 0): pd.DataFrame({'b': [10, 20, 30],
                                   'c': [40, 50, 60],
                                   'd': [70, 80, 90]}),
           ('y', 1): pd.DataFrame({'b': [40, 50, 60],
                                   'c': [30, 20, 10],
                                   'd': [90, 80, 70]},
                                  index=[3, 4, 5])}
    meta = make_meta({'b': 'i8', 'c': 'i8', 'd': 'i8'},
                     index=pd.Index([], 'i8'))
    d = dd.DataFrame(dsk, 'y', meta, [0, 3, 5])

    cases = [[a, b], [a, c], [a, d]]
    assert dd.concat([a]) is a
    for case in cases:
        result = dd.concat(case)
        pdcase = [_c.compute() for _c in case]

        assert result.npartitions == case[0].npartitions + case[1].npartitions
        assert result.divisions == (None, ) * (result.npartitions + 1)
        assert_eq(pd.concat(pdcase), result)
        assert set(result.dask) == set(dd.concat(case).dask)

        result = dd.concat(case, join='inner')
        assert result.npartitions == case[0].npartitions + case[1].npartitions
        assert result.divisions == (None, ) * (result.npartitions + 1)
        assert_eq(pd.concat(pdcase, join='inner'), result)
        assert set(result.dask) == set(dd.concat(case, join='inner').dask)

def test_concat4_interleave_partitions():
    pdf1 = pd.DataFrame(np.random.randn(10, 5),
                        columns=list('ABCDE'), index=list('abcdefghij'))
    pdf2 = pd.DataFrame(np.random.randn(13, 5),
                        columns=list('ABCDE'), index=list('fghijklmnopqr'))
    pdf3 = pd.DataFrame(np.random.randn(13, 6),
                        columns=list('CDEXYZ'), index=list('fghijklmnopqr'))

    ddf1 = dd.from_pandas(pdf1, 2)
    ddf2 = dd.from_pandas(pdf2, 3)
    ddf3 = dd.from_pandas(pdf3, 2)

    msg = ('All inputs have known divisions which cannot be '
           'concatenated in order. Specify '
           'interleave_partitions=True to ignore order')

    cases = [[ddf1, ddf1], [ddf1, ddf2], [ddf1, ddf3],
             [ddf2, ddf1], [ddf2, ddf3],
             [ddf3, ddf1], [ddf3, ddf2]]
    for case in cases:
        pdcase = [c.compute() for c in case]

        with pytest.raises(ValueError) as err:
            dd.concat(case)
        assert msg in str(err.value)

        assert_eq(dd.concat(case, interleave_partitions=True),
                  pd.concat(pdcase))
        assert_eq(dd.concat(case, join='inner', interleave_partitions=True),
                  pd.concat(pdcase, join='inner'))

    msg = "'join' must be 'inner' or 'outer'"
    with pytest.raises(ValueError) as err:
        dd.concat([ddf1, ddf1], join='invalid', interleave_partitions=True)
    assert msg in str(err.value)

def test_concat4_interleave_partitions():
    pdf1 = pd.DataFrame(np.random.randn(10, 5),
                        columns=list("ABCDE"), index=list("abcdefghij"))
    pdf2 = pd.DataFrame(np.random.randn(13, 5),
                        columns=list("ABCDE"), index=list("fghijklmnopqr"))
    pdf3 = pd.DataFrame(np.random.randn(13, 6),
                        columns=list("CDEXYZ"), index=list("fghijklmnopqr"))

    ddf1 = dd.from_pandas(pdf1, 2)
    ddf2 = dd.from_pandas(pdf2, 3)
    ddf3 = dd.from_pandas(pdf3, 2)

    msg = (
        "All inputs have known divisions which cannnot be "
        "concatenated in order. Specify "
        "interleave_partitions=True to ignore order"
    )

    cases = [[ddf1, ddf1], [ddf1, ddf2], [ddf1, ddf3],
             [ddf2, ddf1], [ddf2, ddf3],
             [ddf3, ddf1], [ddf3, ddf2]]
    for case in cases:
        pdcase = [c.compute() for c in case]

        with tm.assertRaisesRegexp(ValueError, msg):
            dd.concat(case)

        assert eq(dd.concat(case, interleave_partitions=True),
                  pd.concat(pdcase))
        assert eq(dd.concat(case, join="inner", interleave_partitions=True),
                  pd.concat(pdcase, join="inner"))

    msg = "'join' must be 'inner' or 'outer'"
    with tm.assertRaisesRegexp(ValueError, msg):
        dd.concat([ddf1, ddf1], join="invalid", interleave_partitions=True)

def test_concat2():
    dsk = {
        ("x", 0): pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}),
        ("x", 1): pd.DataFrame({"a": [4, 5, 6], "b": [3, 2, 1]}),
        ("x", 2): pd.DataFrame({"a": [7, 8, 9], "b": [0, 0, 0]}),
    }
    a = dd.DataFrame(dsk, "x", ["a", "b"], [None, None])
    dsk = {
        ("y", 0): pd.DataFrame({"a": [10, 20, 30], "b": [40, 50, 60]}),
        ("y", 1): pd.DataFrame({"a": [40, 50, 60], "b": [30, 20, 10]}),
        ("y", 2): pd.DataFrame({"a": [70, 80, 90], "b": [0, 0, 0]}),
    }
    b = dd.DataFrame(dsk, "y", ["a", "b"], [None, None])

    dsk = {
        ("y", 0): pd.DataFrame({"b": [10, 20, 30], "c": [40, 50, 60]}),
        ("y", 1): pd.DataFrame({"b": [40, 50, 60], "c": [30, 20, 10]}),
    }
    c = dd.DataFrame(dsk, "y", ["b", "c"], [None, None])

    dsk = {
        ("y", 0): pd.DataFrame({"b": [10, 20, 30], "c": [40, 50, 60],
                                "d": [70, 80, 90]}),
        ("y", 1): pd.DataFrame({"b": [40, 50, 60], "c": [30, 20, 10],
                                "d": [90, 80, 70]},
                               index=[3, 4, 5]),
    }
    d = dd.DataFrame(dsk, "y", ["b", "c", "d"], [0, 3, 5])

    cases = [[a, b], [a, c], [a, d]]
    assert dd.concat([a]) is a
    for case in cases:
        result = dd.concat(case)
        pdcase = [c.compute() for c in case]

        assert result.npartitions == case[0].npartitions + case[1].npartitions
        assert result.divisions == (None,) * (result.npartitions + 1)
        assert eq(pd.concat(pdcase), result)
        assert result.dask == dd.concat(case).dask

        result = dd.concat(case, join="inner")
        assert result.npartitions == case[0].npartitions + case[1].npartitions
        assert result.divisions == (None,) * (result.npartitions + 1)
        assert eq(pd.concat(pdcase, join="inner"), result)
        assert result.dask == dd.concat(case, join="inner").dask

        msg = ("Unable to concatenate DataFrame with unknown division "
               "specifying axis=1")
        with tm.assertRaisesRegexp(ValueError, msg):
            dd.concat(case, axis=1)

def test_concat_datetimeindex():
    # https://github.com/dask/dask/issues/2932
    b2 = pd.DataFrame({'x': ['a']},
                      index=pd.DatetimeIndex(['2015-03-24 00:00:16'],
                                             dtype='datetime64[ns]'))
    b3 = pd.DataFrame({'x': ['c']},
                      index=pd.DatetimeIndex(['2015-03-29 00:00:44'],
                                             dtype='datetime64[ns]'))

    b2['x'] = b2.x.astype('category').cat.set_categories(['a', 'c'])
    b3['x'] = b3.x.astype('category').cat.set_categories(['a', 'c'])

    db2 = dd.from_pandas(b2, 1)
    db3 = dd.from_pandas(b3, 1)

    result = concat([b2.iloc[:0], b3.iloc[:0]])
    assert result.index.dtype == '<M8[ns]'

    result = dd.concat([db2, db3])
    expected = pd.concat([b2, b3])
    assert_eq(result, expected)

def test_orc_multiple(orc_files):
    d = read_orc(orc_files[0])
    d2 = read_orc(orc_files)
    assert_eq(d2[columns], dd.concat([d, d])[columns], check_index=False)
    d2 = read_orc(os.path.dirname(orc_files[0]) + '/*.orc')
    assert_eq(d2[columns], dd.concat([d, d])[columns], check_index=False)

def concat(cls, columns_objs):
    cast_objs = cls.cast(columns_objs)
    return dd.concat([col.data for col in cast_objs])

def to_dd(self) -> dd.DataFrame:
    dfs = []
    for group in self.groups:
        df = dd.from_dask_array(self.conn[group], columns=[group])
        dfs.append(df)
    return dd.concat(dfs, axis=1)

def test_concat5():
    pdf1 = pd.DataFrame(np.random.randn(7, 5),
                        columns=list("ABCDE"), index=list("abcdefg"))
    pdf2 = pd.DataFrame(np.random.randn(7, 6),
                        columns=list("FGHIJK"), index=list("abcdefg"))
    pdf3 = pd.DataFrame(np.random.randn(7, 6),
                        columns=list("FGHIJK"), index=list("cdefghi"))
    pdf4 = pd.DataFrame(np.random.randn(7, 5),
                        columns=list("FGHAB"), index=list("cdefghi"))
    pdf5 = pd.DataFrame(np.random.randn(7, 5),
                        columns=list("FGHAB"), index=list("fklmnop"))

    ddf1 = dd.from_pandas(pdf1, 2)
    ddf2 = dd.from_pandas(pdf2, 3)
    ddf3 = dd.from_pandas(pdf3, 2)
    ddf4 = dd.from_pandas(pdf4, 2)
    ddf5 = dd.from_pandas(pdf5, 3)

    cases = [
        [ddf1, ddf2], [ddf1, ddf3], [ddf1, ddf4], [ddf1, ddf5],
        [ddf3, ddf4], [ddf3, ddf5], [ddf5, ddf1, ddf4], [ddf5, ddf3],
        [ddf1.A, ddf4.A], [ddf2.F, ddf3.F], [ddf4.A, ddf5.A],
        [ddf1.A, ddf4.F], [ddf2.F, ddf3.H], [ddf4.A, ddf5.B],
        [ddf1, ddf4.A], [ddf3.F, ddf2], [ddf5, ddf1.A, ddf2],
    ]

    for case in cases:
        pdcase = [c.compute() for c in case]

        assert eq(dd.concat(case, interleave_partitions=True),
                  pd.concat(pdcase))
        assert eq(dd.concat(case, join="inner", interleave_partitions=True),
                  pd.concat(pdcase, join="inner"))
        assert eq(dd.concat(case, axis=1), pd.concat(pdcase, axis=1))
        assert eq(dd.concat(case, axis=1, join="inner"),
                  pd.concat(pdcase, axis=1, join="inner"))

    # Dask + pandas
    cases = [
        [ddf1, pdf2], [ddf1, pdf3], [pdf1, ddf4],
        [pdf1.A, ddf4.A], [ddf2.F, pdf3.F],
        [ddf1, pdf4.A], [ddf3.F, pdf2], [ddf2, pdf1, ddf3.F],
    ]

    for case in cases:
        pdcase = [c.compute() if isinstance(c, _Frame) else c for c in case]

        assert eq(dd.concat(case, interleave_partitions=True),
                  pd.concat(pdcase))
        assert eq(dd.concat(case, join="inner", interleave_partitions=True),
                  pd.concat(pdcase, join="inner"))
        assert eq(dd.concat(case, axis=1), pd.concat(pdcase, axis=1))
        assert eq(dd.concat(case, axis=1, join="inner"),
                  pd.concat(pdcase, axis=1, join="inner"))
