def read_avro(urlpath, blocksize=100000000, storage_options=None):
    """Read set of avro files

    Use this with arbitrary nested avro schemas. Please refer to the
    fastavro documentation for its capabilities:
    https://github.com/fastavro/fastavro

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    blocksize: int or None
        Size of chunks in bytes. If None, there will be no chunking and each
        file will become one partition.
    storage_options: dict or None
        passed to backend file-system
    """
    from dask.utils import import_required
    from dask import delayed, compute
    from dask.bytes.core import open_files, read_bytes
    from dask.bag import from_delayed

    import_required('fastavro',
                    "fastavro is a required dependency for using "
                    "bag.read_avro().")

    storage_options = storage_options or {}
    files = open_files(urlpath, **storage_options)
    if blocksize is not None:
        dhead = delayed(open_head)
        heads = compute(*[dhead(f) for f in files])
        dread = delayed(read_chunk)
        bits = []
        for head, f in zip(heads, files):
            _, chunks = read_bytes(f.path, sample=False, blocksize=blocksize,
                                   delimiter=head['sync'], include_path=False,
                                   **storage_options)
            bits.extend([dread(ch, head) for ch in chunks[0]])
        return from_delayed(bits)
    else:
        files = open_files(urlpath, **storage_options)
        dread = delayed(read_file)
        chunks = [dread(fo) for fo in files]
        return from_delayed(chunks)
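
# Usage sketch for the function above (the bucket and glob are hypothetical).
# With the default blocksize each file is split on the avro sync marker into
# several bag partitions; blocksize=None keeps one partition per file.
import dask.bag as db

events = db.read_avro('s3://example-bucket/events/*.avro',
                      storage_options={'anon': True})
print(events.take(3))
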
def _ddfs_to_bag(data, cube):
    if not isinstance(data, dict):
        data = {cube.seed_dataset: data}
    ktk_cube_dataset_ids = sorted(data.keys())
    bags = []
    for ktk_cube_dataset_id in ktk_cube_dataset_ids:
        bags.append(
            db.from_delayed(data[ktk_cube_dataset_id].to_delayed()).map_partitions(
                _convert_write_bag, ktk_cube_dataset_id=ktk_cube_dataset_id
            )
        )
    return (db.concat(bags), ktk_cube_dataset_ids)
def test_from_delayed_iterator():
    from dask.delayed import delayed

    def lazy_records(n):
        return ({'operations': [1, 2]} for _ in range(n))

    delayed_records = delayed(lazy_records, pure=False)
    bag = db.from_delayed([delayed_records(5) for _ in range(5)])
    assert db.compute(
        bag.count(),
        bag.pluck('operations').count(),
        bag.pluck('operations').concat().count(),
        get=dask.get,
    ) == (25, 25, 50)
def test_from_delayed_iterator():
    from dask.delayed import delayed

    def lazy_records(n):
        return ({"operations": [1, 2]} for _ in range(n))

    delayed_records = delayed(lazy_records, pure=False)
    bag = db.from_delayed([delayed_records(5) for _ in range(5)])
    assert (db.compute(
        bag.count(),
        bag.pluck("operations").count(),
        bag.pluck("operations").flatten().count(),
        scheduler="sync",
    ) == (25, 25, 50))
def test_from_delayed_iterator():
    from dask.delayed import delayed

    def lazy_records(n):
        return ({'operations': [1, 2]} for _ in range(n))

    delayed_records = delayed(lazy_records, pure=False)
    bag = db.from_delayed([delayed_records(5) for _ in range(5)])
    assert db.compute(
        bag.count(),
        bag.pluck('operations').count(),
        bag.pluck('operations').flatten().count(),
        get=dask.get,
    ) == (25, 25, 50)
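
# The pattern the three tests above exercise, outside of a test harness: a
# delayed function may return a generator, and each delayed call becomes one
# bag partition. A minimal sketch; `load_page` and the record shape are
# hypothetical.
import dask.bag as db
from dask import delayed

def load_page(n):
    # stream records without materializing a list
    return ({"page": n, "item": i} for i in range(10))

load_page = delayed(load_page, pure=False)
bag = db.from_delayed([load_page(n) for n in range(4)])
assert bag.count().compute() == 40
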
def load(self, dataset, subset, key, count, start):
    import binascii

    rpc_connection = dask.delayed(ycore.create_rpc_conn)(dataset)
    if count == -1:
        count = dask.delayed(rpc_connection.liststreams)(subset)[0]['items']
    else:
        count = dask.delayed(count)
    start = dask.delayed(start)
    xset = dask.delayed(rpc_connection.liststreamkeyitems)(
        subset, key, False, count, start)
    xbag = db.from_delayed(xset).pluck('data').map(binascii.unhexlify)
    # xbag = xbag.map(binascii.unhexlify)
    return xbag
def to_dask(source):
    chunksize = 100000  # FIXME: Where should this come from?
    futures = [read_partition(source, i) for i in range(source.npartitions)]

    if source.container == 'ndarray':
        # array_parts = [da.from_delayed(f, shape=c.shape, dtype=c.dtype) for f, c in zip(futures, chunks)]
        # return da.concatenate(array_parts, axis=0)
        raise ValueError('FIXME: Support ndarray concatenation')
    elif source.container == 'dataframe':
        return dd.from_delayed(futures)
    elif source.container == 'list':
        return db.from_delayed(futures)
    else:
        raise ValueError('Unknown container type: %s' % source.container)
def to_dask(source):
    futures = [read_partition(source, i) for i in range(source.npartitions)]

    if source.container == 'ndarray':
        # array_parts = [da.from_delayed(f, shape=c.shape, dtype=c.dtype) for f, c in zip(futures, chunks)]
        # return da.concatenate(array_parts, axis=0)
        raise ValueError('FIXME: Support ndarray concatenation')
    elif source.container == 'dataframe':
        import dask.dataframe as dd
        return dd.from_delayed(futures)
    elif source.container == 'list':
        import dask.bag as db
        return db.from_delayed(futures)
    else:
        raise ValueError('Unknown container type: %s' % source.container)
def main(files, client):
    bag = db.from_delayed([delayed(parse_single_file)(fn) for fn in files])
    df = bag.to_dataframe(columns=columns)

    # Nonstandard and inconsistent date formats in input.
    # These two lines standardize to ISO.
    df['endtime'] = df['endtime'].astype(np.datetime64)

    df['cumul_entries'] = df.cumul_entries.astype(np.int64)
    df['cumul_exits'] = df.cumul_exits.astype(np.int64)
    df = df.repartition(npartitions=16)
    df.to_parquet(os.path.join(config['parquet_output_path'], 'subway.parquet'),
                  compression='SNAPPY',
                  object_encoding='json')
def extract_entities_and_predicates_from_sentences(
    sentence_records: dbag.Bag,
    semrep_install_dir: Path,
    unicode_to_ascii_jar_path: Path,
    work_dir: Path,
    lexicon_year: int,
    mm_data_year: str,
    mm_data_version: str,
) -> dbag.Bag:
    """Runs each sentence through SemRep. Identifies Predicates and Entities.

    Requires get_metamap_server_initializer added to dask_process_global.

    Args:
      sentence_records: Each record needs `id` and `sent_text`.
      work_dir: A directory visible to all workers where SemRep intermediate
        files will be stored.
      semrep_install_dir: The path where semrep was installed.

    Returns:
      One record per input sentence, where `id` of the new record matches the
      input. However, returned records will only have `entities` and
      `predicates`.
    """
    work_dir = Path(work_dir)
    assert work_dir.is_dir(), f"Failed to find shared work_dir: {work_dir}"
    semrep_input_dir = work_dir.joinpath("input_files")
    semrep_output_dir = work_dir.joinpath("output_files")
    semrep_input_dir.mkdir(exist_ok=True, parents=True)
    semrep_output_dir.mkdir(exist_ok=True, parents=True)

    semrep_tasks = []
    for part_idx, partition in enumerate(sentence_records.to_delayed()):
        semrep_input_path = semrep_input_dir.joinpath(f"input_{part_idx}.txt")
        # semrep_output_path = semrep_output_dir.joinpath(f"ouput_{part_idx}.xml")
        semrep_tasks.append(dask.delayed(_sentence_partition_to_records)(
            records=partition,
            unicode_to_ascii_jar_path=unicode_to_ascii_jar_path,
            input_path=semrep_input_path,
            semrep_install_dir=semrep_install_dir,
            lexicon_year=lexicon_year,
            mm_data_year=mm_data_year,
            mm_data_version=mm_data_version,
        ))
    return dbag.from_delayed(semrep_tasks)
def main(files, client):
    bag = db.from_delayed([delayed(parse_single_file)(fn) for fn in files])
    df = bag.to_dataframe(columns=columns)

    # Nonstandard and inconsistent date formats in input.
    # These two lines standardize to ISO.
    df['endtime'] = df['endtime'].astype(np.datetime64)

    df['cumul_entries'] = df.cumul_entries.astype(np.int64)
    df['cumul_exits'] = df.cumul_exits.astype(np.int64)
    df = df.repartition(npartitions=16)
    df.to_parquet(os.path.join(config['parquet_output_path'], 'subway.parquet'),
                  compression='SNAPPY',
                  object_encoding='json')
def main():
    fp = '/project/euro4_hindcast/WIND-ATLAS_EURO4-RERUN/2015/06/18Z/'
    # fn = 'EURO4_2015060[1-3].pp'
    fn = '*.pp'

    dlyd = load_subset(fp, fn)
    cs = db.from_delayed(dlyd)
    cubes = iris.cube.CubeList(cs.compute())

    # The x- and y-wind cubes are on different domains. This notwithstanding,
    # the x-wind cube also has one more latitude point than the y-wind cube,
    # which we arbitrarily chop off.
    x_wind_cube = cubes[0][..., :-1, :]
    y_wind_cube = cubes[1]

    wspd_cube, theta_cube = xy_to_wspd_and_dir(x_wind_cube, y_wind_cube)
    wspd_var_cube, wdir_var_cube = mln_variance(wspd_cube, theta_cube)
    print(wspd_var_cube)
    print(wdir_var_cube)

    wspd_var_data = wspd_var_cube.data
    wdir_var_data = wdir_var_cube.data
def get_comments(pages: int) -> DataFrame:
    """Get List of comments per page"""
    comments_delayed = (get_elements_json(url=COMMENTS_ENDPOINT,
                                          page=str(page),
                                          file="comments.dat")
                        for page in range(1, pages + 1))
    comments_bag = db.from_delayed(comments_delayed).map(
        lambda d: {
            "id": d["id"],
            "post": d["post"],
            "author_name": d["author_name"],
            "date": create_datetime(d["date"]),
            "content": d["content"]["rendered"],
            "link": d["link"],
        })

    # In Windows the Dask.Bag is multiprocessing by default, change to threads
    with dask.config.set(scheduler="threads"):
        comments: List[dict] = comments_bag.compute()

    df_comments = DataFrame(comments)
    return df_comments
def read_avro(self, file_path, blocksize=1048576):
    """
    Read the downloaded query into avro

    Args:
        file_path (str): A message from the GDAX websocket API
        blocksize (int): Size of blocks. Note that this size must be larger
            than the files' internal block size, otherwise it will result in
            empty partitions. Default is 1MB

    :rtype: dask.bag.Bag
    :returns: a dask bag of the decoded avro records
    """
    from dask_avro import read_avro
    from dask.bag import from_delayed

    delayed_avro = read_avro(file_path, blocksize=blocksize)
    return from_delayed(delayed_avro)
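
# A self-contained sketch of the same pattern without the wrapper class,
# using only what the method above already relies on (the path and blocksize
# are illustrative; the file is assumed to exist locally).
from dask_avro import read_avro
from dask.bag import from_delayed

bag = from_delayed(read_avro('downloads/trades.avro', blocksize=4 * 1024 * 1024))
print(bag.take(5))
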
def cull_empty_partitions(bag):
    """When bags are created by filtering or grouping from a different bag,
    it retains the original bag's partition count, even if a lot of the
    partitions become empty. Those extra partitions add overhead, so it's
    nice to discard them. This function drops the empty partitions.

    Parameters
    ----------
    bag: dask.bag

    Returns
    -------
    partitions: dask.bag
    """
    bag = bag.persist()

    def get_len(partition):
        # If the bag is the result of bag.filter(),
        # then each partition is actually a 'filter' object,
        # which has no __len__.
        # In that case, we must convert it to a list first.
        if hasattr(partition, '__len__'):
            return len(partition)
        return len(list(partition))

    partition_lengths = bag.map_partitions(get_len).compute()

    # Convert bag partitions into a list of 'delayed' objects
    lengths_and_partitions = zip(partition_lengths, bag.to_delayed())

    # Drop the ones with empty partitions
    partitions = (p for l, p in lengths_and_partitions if l > 0)

    # return list of delayed objects
    return db.from_delayed(partitions)
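
# Usage sketch for the helper above (assumption: `db` is dask.bag and the
# filter leaves most partitions empty). After culling, downstream steps no
# longer schedule tasks for empty partitions.
import dask.bag as db

numbers = db.from_sequence(range(1000), npartitions=100)
rare = numbers.filter(lambda x: x % 250 == 0)   # most partitions end up empty
rare = cull_empty_partitions(rare)
print(rare.npartitions, rare.compute())
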
def to_dask(self):
    import dask.bag as db
    from dask import delayed
    dfile = delayed(get_file)
    return db.from_delayed([dfile(f) for f in self._files])
def get_medline_documents(config: cpb.ConstructConfig) -> dbag.Bag:
    medline_dir = Path(config.medline_xml_dir)
    medline_dir.mkdir(parents=True, exist_ok=True)
    assert medline_dir.is_dir(), f"Failed to make {config.medline_xml_dir}"

    # Download all of pubmed. ####
    if not config.skip_ftp_download:
        print("Downloading pubmed XML Files")
        with ftp_util.ftp_connect(
            address=config.ftp.address,
            workdir=config.ftp.workdir,
        ) as conn:
            # Downloads new files if not already present in shared
            xml_paths = ftp_util.ftp_retreive_all(
                conn=conn,
                pattern="^.*\.xml\.gz$",
                directory=medline_dir,
                show_progress=True,
            )
        if config.ftp.include_daily_updates:
            print("Downloading daily updates...")
            with ftp_util.ftp_connect(
                address=config.ftp.address,
                workdir=config.ftp.workdir_daily,
            ) as conn:
                # Downloads new files if not already present in shared
                xml_paths_daily = ftp_util.ftp_retreive_all(
                    conn=conn,
                    pattern="^.*\.xml\.gz$",
                    directory=medline_dir,
                    show_progress=True,
                )
            xml_paths += xml_paths_daily
    else:
        print(f"Skipping FTP download, using {medline_dir}/*.xml.gz instead")
        assert medline_dir.is_dir(), f"Cannot find {medline_dir}"
        xml_paths = list(medline_dir.glob("*.xml.gz"))
        assert len(xml_paths) > 0, f"No .xml.gz files inside {medline_dir}"

    if config.debug.enable:
        print(f"\t- Downsampling {len(xml_paths)} xml files to only "
              f"{config.debug.partition_subset_size}.")
        # Takes the top x (typically larger)
        xml_paths = xml_paths[-config.debug.partition_subset_size:]

    # Parse xml-files per-partition
    medline_documents = dbag.from_delayed([
        dask.delayed(parse_pubmed_xml.parse_zipped_pubmed_xml)(xml_path=p)
        for p in xml_paths
    ])

    if not config.allow_nonenglish_abstracts:
        medline_documents = medline_documents.filter(
            # Only take the english ones
            lambda r: r["language"] == "eng")

    if config.HasField("cut_date"):
        # This will fail if the cut-date is not a valid string
        datetime.strptime(config.cut_date, "%Y-%m-%d")
        medline_documents = medline_documents.filter(
            lambda r: r["date"] < config.cut_date)

    if config.debug.enable:
        print("\t- Downsampling documents by "
              f"{config.debug.document_sample_rate}")
        medline_documents = medline_documents.random_sample(
            config.debug.document_sample_rate,
        )
    return medline_documents
def to_dask(self):
    """Return a dask-bag of results"""
    import dask.bag as db
    import dask.delayed
    dpart = dask.delayed(self._get_partition)
    return db.from_delayed([dpart(i) for i in range(self.npartitions)])
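
# The common pattern behind the to_dask() methods in this collection: wrap a
# per-partition reader in dask.delayed and hand the list to db.from_delayed().
# A minimal self-contained sketch; PartitionedSource and its reader are
# hypothetical stand-ins for the real driver classes.
import dask
import dask.bag as db

class PartitionedSource:
    def __init__(self, npartitions=4):
        self.npartitions = npartitions

    def _get_partition(self, i):
        # each partition yields a list of records
        return [{"partition": i, "value": v} for v in range(3)]

    def to_dask(self):
        dpart = dask.delayed(self._get_partition)
        return db.from_delayed([dpart(i) for i in range(self.npartitions)])

bag = PartitionedSource().to_dask()
assert bag.count().compute() == 12
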
def test_futures_to_delayed_bag(c):
    L = [1, 2, 3]

    futures = c.scatter([L, L])
    b = db.from_delayed(futures)
    assert list(b) == L + L
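
# The same futures-to-bag round trip outside the test harness. A sketch that
# assumes a dask.distributed cluster can be started locally; each scattered
# list becomes one bag partition.
from dask.distributed import Client
import dask.bag as db

client = Client()                        # local cluster for illustration
futures = client.scatter([[1, 2, 3], [4, 5, 6]])
bag = db.from_delayed(futures)
print(bag.sum().compute())               # 21
client.close()
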
def execute(self):
    self._init_service()
    mgr_client = self.mgr_client

    options = self.config["stitchedmeshes"]

    server, uuid, instance = self.input_service.base_service.instance_triple
    is_supervoxels = self.input_service.base_service.supervoxels
    bodies = load_body_list(options["bodies"], is_supervoxels)

    logger.info(f"Input is {len(bodies)} bodies")
    os.makedirs(options["output-directory"], exist_ok=True)

    def make_bricks(coord_and_block):
        coord_zyx, block_vol = coord_and_block
        logical_box = np.array((coord_zyx, coord_zyx + block_vol.shape))
        return Brick(logical_box, logical_box, block_vol,
                     location_id=(logical_box // 64))

    rescale = (2**options["scale"]) * options["extra-rescale"]

    def create_brick_mesh(brick):
        mesh = Mesh.from_binary_vol(brick.volume, brick.physical_box)
        if rescale != 1.0:
            mesh.vertices_zyx *= rescale
        return mesh

    def create_combined_mesh(meshes):
        mesh = concatenate_meshes(meshes, False)
        if options["stitch"]:
            mesh.stitch_adjacent_faces(drop_unused_vertices=True,
                                       drop_duplicate_faces=True)
        mesh.laplacian_smooth(options["smoothing-iterations"])
        mesh.simplify(options["decimation-fraction"], in_memory=True)
        return mesh

    in_flight = 0

    # Support synchronous testing with a fake 'as_completed' object
    if hasattr(self.client, 'DEBUG'):
        result_futures = as_completed_synchronous()
    else:
        result_futures = as_completed()

    def pop_result():
        nonlocal in_flight
        r = next(result_futures)
        in_flight -= 1

        try:
            return r.result()
        except Exception as ex:
            if options["error-mode"] == "raise":
                raise
            body = int(r.key)
            return (body, 0, 'error', str(ex))

    USER = getpass.getuser()
    results = []
    try:
        for i, body in enumerate(bodies):
            logger.info(f"Mesh #{i}: Body {body}: Starting")

            def fetch_sparsevol():
                with mgr_client.access_context(server, True, 1, 0):
                    ns = default_node_service(server, uuid,
                                              'flyemflows-stitchedmeshes', USER)
                    coords_zyx, blocks = ns.get_sparselabelmask(
                        body, instance, options["scale"], is_supervoxels)
                    return list(coords_zyx.copy()), list(blocks.copy())

            # This leaves all blocks and bricks in a single partition,
            # but we're about to do a shuffle anyway when the bricks are realigned.
            coords, blocks = delayed(fetch_sparsevol, nout=2)()
            coords, blocks = db.from_delayed(coords), db.from_delayed(blocks)
            bricks = db.zip(coords, blocks).map(make_bricks)

            mesh_grid = Grid((64, 64, 64), halo=options["block-halo"])
            wall = BrickWall(None, (64, 64, 64), bricks)
            wall = wall.realign_to_new_grid(mesh_grid)

            brick_meshes = wall.bricks.map(create_brick_mesh)
            consolidated_brick_meshes = brick_meshes.repartition(1)
            combined_mesh = delayed(create_combined_mesh)(consolidated_brick_meshes)

            def write_mesh(mesh):
                output_dir = options["output-directory"]
                fmt = options["format"]
                output_path = f'{output_dir}/{body}.{fmt}'
                mesh.serialize(output_path)
                return (body, len(mesh.vertices_zyx), 'success', '')

            # We hide the body ID in the task name, so that we can record it in pop_result
            task = delayed(write_mesh)(combined_mesh, dask_key_name=f'{body}')
            result_futures.add(self.client.compute(task))
            in_flight += 1

            assert in_flight <= options["concurrent-bodies"]
            while in_flight == options["concurrent-bodies"]:
                body, vertices, result, msg = pop_result()
                if result == "error":
                    logger.warning(f"Body {body}: Failed to generate mesh: {msg}")
                results.append((body, vertices, result, msg))

        # Flush the last batch of tasks
        while in_flight > 0:
            body, vertices, result, msg = pop_result()
            if result == "error":
                logger.warning(f"Body {body}: Failed to generate mesh: {msg}")
            results.append((body, vertices, result, msg))
    finally:
        stats_df = pd.DataFrame(results,
                                columns=['body', 'vertices', 'result', 'msg'])
        stats_df.to_csv('mesh-stats.csv', index=False, header=True)

        failed_df = stats_df.query("result != 'success'")
        if len(failed_df) > 0:
            logger.warning(f"Failed to create meshes for {len(failed_df)} bodies. "
                           "See mesh-stats.csv")
    except requests.exceptions.InvalidSchema:
        return []  # Skip non-web links

dask.compute(crawl("http://holdenkarau.com/"))
#end::mini_crawl_task[]


# In[ ]:


#tag::make_bag_of_crawler[]
import dask.bag as db

githubs = [
    "https://github.com/scalingpythonml/scalingpythonml",
    "https://github.com/dask/distributed"
]
initial_bag = db.from_delayed(map(crawl, githubs))
#end::make_bag_of_crawler[]


# In[ ]:


#tag::make_a_bag_of_words[]
words_bag = initial_bag.map(
    lambda url_contents: url_contents[1].split(" ")).flatten()
#end::make_a_bag_of_words[]


# In[ ]:


#tag::wc_freq[]
dask.compute(words_bag.frequencies())
#end::wc_freq[]
def read_avro(urlpath, blocksize=100000000, storage_options=None,
              compression=None):
    """Read set of avro files

    Use this with arbitrary nested avro schemas. Please refer to the
    fastavro documentation for its capabilities:
    https://github.com/fastavro/fastavro

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    blocksize: int or None
        Size of chunks in bytes. If None, there will be no chunking and each
        file will become one partition.
    storage_options: dict or None
        passed to backend file-system
    compression: str or None
        Compression format of the target(s), like 'gzip'. Should only be used
        with blocksize=None.
    """
    from dask.utils import import_required
    from dask import delayed, compute
    from dask.bytes.core import (open_files, get_fs_token_paths,
                                 OpenFile, tokenize)
    from dask.bag import from_delayed

    import_required('fastavro',
                    "fastavro is a required dependency for using "
                    "bag.read_avro().")

    storage_options = storage_options or {}
    if blocksize is not None:
        fs, fs_token, paths = get_fs_token_paths(
            urlpath, mode='rb', storage_options=storage_options)
        dhead = delayed(open_head)
        out = compute(*[dhead(fs, path, compression) for path in paths])
        heads, sizes = zip(*out)
        dread = delayed(read_chunk)

        offsets = []
        lengths = []
        for size in sizes:
            off = list(range(0, size, blocksize))
            length = [blocksize] * len(off)
            offsets.append(off)
            lengths.append(length)

        out = []
        for path, offset, length, head in zip(paths, offsets, lengths, heads):
            delimiter = head['sync']
            f = OpenFile(fs, path, compression=compression)
            token = tokenize(fs_token, delimiter, path, fs.ukey(path),
                             compression, offset)
            keys = ['read-avro-%s-%s' % (o, token) for o in offset]
            values = [dread(f, o, l, head, dask_key_name=key)
                      for o, key, l in zip(offset, keys, length)]
            out.extend(values)
        return from_delayed(out)
    else:
        files = open_files(urlpath, compression=compression, **storage_options)
        dread = delayed(read_file)
        chunks = [dread(fo) for fo in files]
        return from_delayed(chunks)
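
# Usage sketch for the compression-aware variant above: gzipped avro cannot be
# split on the sync marker, so per the docstring, compression should be paired
# with blocksize=None (one partition per file). The glob is hypothetical.
compressed = read_avro('data/*.avro.gz', blocksize=None, compression='gzip')
print(compressed.npartitions)
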
def to_dask(self):
    import dask.bag as db
    import dask
    return db.from_delayed([dask.delayed(self._get_partition)(i)
                            for i in range(self.npartitions)])
def to_dask(self):
    import dask.delayed
    import dask.bag as db
    dpart = dask.delayed(read_stream)
    parts = [dpart(stream) for stream in self._streams]
    return db.from_delayed(parts)
def get_default_rules():
    """
    Return the list of default rules used to interpret expressions in
    :func:`apply()<flowly.dsk.apply>`.

    Each rule has an additional property ``.name``, that may be useful when
    modifying the rules.
    """
    return [
        adict(
            name='builtins.sum',
            match=_match_equal(builtins.sum),
            apply=_methodcaller('sum'),
        ),
        adict(
            name='builtins.all',
            match=_match_equal(builtins.all),
            apply=_methodcaller('all'),
        ),
        adict(
            name='builtins.any',
            match=_match_equal(builtins.any),
            apply=_methodcaller('any'),
        ),
        adict(
            name='builtins.len',
            match=_match_equal(builtins.len),
            apply=_methodcaller('count'),
        ),
        adict(
            name='builtins.max',
            match=_match_equal(builtins.max),
            apply=_methodcaller('max'),
        ),
        adict(
            name='builtins.min',
            match=_match_equal(builtins.min),
            apply=_methodcaller('min'),
        ),
        adict(
            name='toolz.concat',
            match=_match_equal(toolz.concat),
            apply=lambda bag, transform, rules: bag.concat(),
        ),
        adict(
            name='toolz.compose',
            match=_match_isinstance(toolz.functoolz.Compose),
            apply=_apply__toolz__compose,
        ),
        adict(
            name='toolz.count',
            match=_match_equal(count),
            apply=_methodcaller('count'),
        ),
        adict(
            name='toolz.frequencies',
            match=_match_equal(frequencies),
            apply=_methodcaller('frequencies'),
        ),
        adict(
            name='toolz.curried.filter',
            match=_match_curried(builtins.filter),
            apply=lambda bag, transform, rules: bag.filter(
                *transform.args, **transform.keywords),
        ),
        adict(
            name='toolz.curried.map',
            match=_match_curried(builtins.map),
            apply=lambda bag, transform, rules: bag.map(
                *transform.args, **transform.keywords),
        ),
        adict(
            name='toolz.curried.mapcat',
            match=_match_curried(toolz.mapcat),
            apply=lambda bag, transform, rules: bag.map(*transform.args).concat(),
        ),
        adict(
            name='toolz.curried.pluck',
            match=_match_curried(toolz.pluck),
            apply=lambda bag, transform, rules: bag.pluck(
                *transform.args, **transform.keywords),
        ),
        adict(
            name='toolz.curried.random_sample',
            match=_match_curried(toolz.random_sample),
            apply=lambda bag, transform, rules: bag.random_sample(
                *transform.args, **transform.keywords),
        ),
        adict(
            name='toolz.curried.reduce',
            match=_match_curried(ft.reduce),
            apply=lambda bag, transform, rules: bag.reduction(
                lambda i: i,
                lambda partitions: ft.reduce(
                    transform.args[0], it.chain.from_iterable(partitions)),
            ),
        ),
        adict(
            name='toolz.curried.remove',
            match=_match_curried(toolz.remove),
            apply=lambda bag, transform, rules: bag.remove(
                *transform.args, **transform.keywords),
        ),
        adict(
            name='toolz.curried.take',
            match=_match_curried(toolz.take),
            apply=lambda bag, transform, rules: bag.take(
                *transform.args,
                **merge(transform.keywords, dict(compute=False, npartitions=-1))
            ),
        ),
        adict(
            name='toolz.curried.topk',
            match=_match_curried(toolz.topk),
            apply=lambda bag, transform, rules: bag.topk(
                *transform.args, **transform.keywords),
        ),
        adict(
            name='toolz.curried.groupby',
            match=_match_curried(toolz.groupby),
            apply=lambda *args, **kwargs: raise_(ValueError, 'use flowly.tz.groupby'),
        ),
        adict(
            name='toolz.curried.reduceby',
            match=_match_curried(toolz.reduceby),
            apply=lambda *args, **kwargs: raise_(ValueError, 'use flowly.tz.reduceby'),
        ),
        adict(
            name='toolz.unique',
            match=_match_equal(toolz.unique),
            apply=_methodcaller('distinct'),
        ),
        adict(
            name='itertools.chain.from_iterable',
            match=_match_equal(it.chain.from_iterable),
            apply=lambda bag, transform, rules: bag.concat(),
        ),
        adict(
            name='flowly.checkpoint.with_checkpoint',
            match=_match_isinstance(with_checkpoint),
            apply=_apply__checkpoint__with_checkpoint,
        ),
        adict(
            name='flowly.tz.apply_concat',
            match=_match_isinstance(apply_concat),
            apply=_apply__flowly__tz__apply_concat,
        ),
        adict(
            name='flowly.tz.apply_map_concat',
            match=_match_isinstance(apply_map_concat),
            apply=_apply__flowly__tz__apply_map_concat,
        ),
        adict(
            name='flowly.tz.build_dict',
            match=_match_isinstance(build_dict),
            apply=_build_dask_dict,
        ),
        adict(
            name='flowly.tz.itemsetter',
            match=_match_isinstance(itemsetter),
            apply=_update_dask_dict,
        ),
        adict(
            name='flowly.tz.chained',
            match=_match_isinstance(chained),
            apply=_apply__flowly__tz__chained,
        ),
        adict(
            name='flowly.tz.groupby',
            match=_match_isinstance(groupby),
            # TODO: inject remaining arguments into groupby
            apply=lambda bag, transform, rules: bag.groupby(transform.key),
        ),
        adict(
            name='flowly.tz.kv_keymap',
            match=_match_isinstance(kv_keymap),
            apply=lambda bag, transform, rules: apply(
                toolz.curried.map(lambda t: (transform.func(t[0]), t[1])),
                bag, rules=rules,
            ),
        ),
        adict(
            name='flowly.tz.kv_valmap',
            match=_match_isinstance(kv_valmap),
            apply=lambda bag, transform, rules: apply(
                toolz.curried.map(lambda t: (t[0], transform.func(t[1]))),
                bag, rules=rules,
            ),
        ),
        adict(
            name='flowly.tz.kv_reduceby',
            match=_match_isinstance(kv_reduceby),
            apply=_apply_kv_reduceby,
        ),
        adict(
            name='flowly.tz.kv_reductionby',
            match=_match_isinstance(kv_reductionby),
            apply=_apply_kv_reductionby,
        ),
        adict(
            name='flowly.tz.reduceby',
            match=_match_isinstance(reduceby),
            apply=lambda bag, transform, rules: (
                bag
                .groupby(transform.key)
                .map(lambda t: (t[0], ft.reduce(transform.binop, t[1])))
            ),
        ),
        adict(
            name='flowly.tz.reduction',
            match=_match_isinstance(reduction),
            apply=lambda bag, transform, rules: bag.reduction(
                transform.perpartition,
                transform.aggregate,
                split_every=transform.split_every,
            ),
        ),
        adict(
            name='flowly.tz.reductionby',
            match=_match_isinstance(reductionby),
            apply=_apply_reductionby,
        ),
        adict(
            name='flowly.tz.seq',
            match=_match_equal(seq),
            apply=lambda item, transform, rules: db.from_delayed([
                item.apply(lambda i: [i]).to_delayed()
            ]),
        ),
        # TODO: let any curried callable fall back to the callable itself, if no args were given
        # TODO: add option to skip arbitrary callables and add marker functions to annotate them
        adict(
            name='callable',
            match=lambda bag, transform, rules: callable(transform),
            apply=lambda bag, transform, rules: transform(bag),
        ),
    ]
async def run():
    number_of_cores_per_node = 16  # DAS-5 features 2x8 NUMA cores per compute node
    reservation_length = "08:00:00"  # 2 hours is more than enough... probably
    cluster = SLURMCluster(cores=number_of_cores_per_node,
                           memory="64 GB",
                           processes=4,
                           scheduler_options={"dashboard_address": ":6868"},
                           local_directory="./aip-logs",
                           interface='ib0',
                           walltime=reservation_length)

    # Grab 5 execution nodes -> 80 cores
    print("Scaling up, getting 5 nodes")
    cluster.scale_up(5)
    client = Client(cluster)

    print("Client is ready, parsing data files...")

    file_locations = "/var/scratch/lvs215/aip_tmp"
    data_files = []

    # Create a list of all the files we want to parse. Skip the compressed
    # sources if they are still lingering around
    for path, subdirs, files in os.walk(file_locations):
        for name in files:
            if isfile(os.path.join(path, name)) and not name.endswith(("gz", "zip", "tar")):
                data_files.append(os.path.join(path, name))

    client.run(clear_all_files)

    # Create one task per file.
    print(data_files)
    print("Creating and executing tasks...")
    tasks = list(map(delayed(process_file), data_files))
    true_false_array = db.from_delayed(tasks)

    # DEBUG CODE
    # future = client.compute(true_false_array)
    # client.recreate_error_locally(future)

    # Time to compute them!
    start = datetime.datetime.now()
    res = true_false_array.compute()
    end = datetime.datetime.now()
    print(true_false_array)
    print(res)
    print("Tasks ran to completion! Copying databases.")
    if False not in true_false_array:
        # If everything went alright, let all nodes copy their databases to the home dir.
        client.run(copy_database_to_home_folder)
        client.run(clear_all_files)
    else:
        print("Parsing one of the files went horribly wrong, quitting!")
        exit(-1)

    print("Beginning assembling of all databases into one!")
    # Now, each of the nodes has a local database file, we will now combine these databases into one.
    # We do this process sequentially, because we are not sure yet if SQLite likes it if all nodes do this in parallel.
    # TODO: test if we can do this procedure in each node through the copy_database_to_home_folder, would save copying data
    database_manager = DatabaseManager()  # This creates an empty aip.db if it doesn't exists.
    con3 = database_manager.db  # Reuse the connection

    # based on https://stackoverflow.com/a/37138506
    os.makedirs(db_files_location, exist_ok=True)
    for file in [os.path.join(db_files_location, f)
                 for f in os.listdir(db_files_location)
                 if isfile(os.path.join(db_files_location, f)) and f.endswith(".db")]:
        con3.execute("ATTACH '{}' as dba".format(file))
        con3.execute("BEGIN")
        for row in con3.execute("SELECT * FROM dba.sqlite_master WHERE type='table'"):
            combine = "INSERT INTO " + row[1] + " SELECT * FROM dba." + row[1]
            print(combine)
            con3.execute(combine)
        con3.execute("detach database dba")
        con3.commit()
        # Now, delete the database as it has been copied.
        # os.remove("{}.db".format(hash(worker)))

    print("All done. Releasing all nodes.")
    await cluster.scale_down(cluster.workers)
    print("Nodes released.")
    print(end - start)
def to_dask(parts, dtype):
    import dask.bag as db
    return db.from_delayed(parts)
def dataset_to_bag(dataset):
    return db.from_delayed(
        [delayed(lambda x: [x])(chunk) for chunk in split_by_chunks(dataset)])
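
# The lambda above wraps each chunk in a one-element list because
# db.from_delayed() expects every delayed value to evaluate to a list of
# records; here each whole chunk becomes a single bag element. A minimal
# self-contained sketch of the same pattern (the "chunks" are hypothetical
# stand-ins for dataset chunks):
import dask.bag as db
from dask import delayed

chunks = [{"name": "a"}, {"name": "b"}, {"name": "c"}]
bag = db.from_delayed([delayed(lambda x: [x])(c) for c in chunks])
assert bag.count().compute() == 3
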
def read_avro(urlpath, blocksize=100000000, storage_options=None,
              compression=None):
    """Read set of avro files

    Use this with arbitrary nested avro schemas. Please refer to the
    fastavro documentation for its capabilities:
    https://github.com/fastavro/fastavro

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    blocksize: int or None
        Size of chunks in bytes. If None, there will be no chunking and each
        file will become one partition.
    storage_options: dict or None
        passed to backend file-system
    compression: str or None
        Compression format of the target(s), like 'gzip'. Should only be used
        with blocksize=None.
    """
    from dask.utils import import_required
    from dask import delayed, compute
    from dask.bytes.core import (open_files, get_fs_token_paths,
                                 OpenFile, tokenize)
    from dask.bag import from_delayed

    import_required(
        'fastavro',
        "fastavro is a required dependency for using bag.read_avro().")

    storage_options = storage_options or {}
    if blocksize is not None:
        fs, fs_token, paths = get_fs_token_paths(
            urlpath, mode='rb', storage_options=storage_options)
        dhead = delayed(open_head)
        out = compute(*[dhead(fs, path, compression) for path in paths])
        heads, sizes = zip(*out)
        dread = delayed(read_chunk)

        offsets = []
        lengths = []
        for size in sizes:
            off = list(range(0, size, blocksize))
            length = [blocksize] * len(off)
            offsets.append(off)
            lengths.append(length)

        out = []
        for path, offset, length, head in zip(paths, offsets, lengths, heads):
            delimiter = head['sync']
            f = OpenFile(fs, path, compression=compression)
            token = tokenize(fs_token, delimiter, path, fs.ukey(path),
                             compression, offset)
            keys = ['read-avro-%s-%s' % (o, token) for o in offset]
            values = [
                dread(f, o, l, head, dask_key_name=key)
                for o, key, l in zip(offset, keys, length)
            ]
            out.extend(values)
        return from_delayed(out)
    else:
        files = open_files(urlpath, **storage_options)
        dread = delayed(read_file)
        chunks = [dread(fo) for fo in files]
        return from_delayed(chunks)