Example #1
def read_avro(urlpath, blocksize=100000000, storage_options=None):
    """Read set of avro files

    Use this with arbitrary nested avro schemas. Please refer to the
    fastavro documentation for its capabilities:
    https://github.com/fastavro/fastavro

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    blocksize: int or None
        Size of chunks in bytes. If None, there will be no chunking and each
        file will become one partition.
    storage_options: dict or None
        passed to backend file-system
    """
    from dask.utils import import_required
    from dask import delayed, compute
    from dask.bytes.core import open_files, read_bytes
    from dask.bag import from_delayed
    import_required('fastavro',
                    "fastavro is a required dependency for using "
                    "bag.read_avro().")

    storage_options = storage_options or {}
    files = open_files(urlpath, **storage_options)
    if blocksize is not None:
        dhead = delayed(open_head)
        heads = compute(*[dhead(f) for f in files])
        dread = delayed(read_chunk)
        bits = []
        for head, f in zip(heads, files):
            _, chunks = read_bytes(f.path, sample=False, blocksize=blocksize,
                                   delimiter=head['sync'], include_path=False,
                                   **storage_options)
            bits.extend([dread(ch, head) for ch in chunks[0]])
        return from_delayed(bits)
    else:
        files = open_files(urlpath, **storage_options)
        dread = delayed(read_file)
        chunks = [dread(fo) for fo in files]
        return from_delayed(chunks)
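A minimal usage sketch for the function above (assuming fastavro is installed and that ``data/*.avro`` matches some files; the path is illustrative):

import dask.bag as db

# With blocksize=None, each matching file becomes one partition
bag = db.read_avro('data/*.avro', blocksize=None)
print(bag.npartitions)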
Example #2
def _ddfs_to_bag(data, cube):
    if not isinstance(data, dict):
        data = {cube.seed_dataset: data}

    ktk_cube_dataset_ids = sorted(data.keys())
    bags = []
    for ktk_cube_dataset_id in ktk_cube_dataset_ids:
        bags.append(
            db.from_delayed(data[ktk_cube_dataset_id].to_delayed()).map_partitions(
                _convert_write_bag, ktk_cube_dataset_id=ktk_cube_dataset_id
            )
        )

    return (db.concat(bags), ktk_cube_dataset_ids)
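For reference, the concat-of-bags pattern used above reduces to something like this toy sketch (names and data are illustrative):

import dask.bag as db

b1 = db.from_sequence([{'id': 1}], npartitions=1)
b2 = db.from_sequence([{'id': 2}], npartitions=1)
combined = db.concat([b1, b2])
assert combined.count().compute() == 2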
Example #3
def test_from_delayed_iterator():
    from dask.delayed import delayed

    def lazy_records(n):
        return ({'operations': [1, 2]} for _ in range(n))

    delayed_records = delayed(lazy_records, pure=False)
    bag = db.from_delayed([delayed_records(5) for _ in range(5)])
    assert db.compute(
        bag.count(),
        bag.pluck('operations').count(),
        bag.pluck('operations').concat().count(),
        get=dask.get,
    ) == (25, 25, 50)
Example #4
def test_from_delayed_iterator():
    from dask.delayed import delayed

    def lazy_records(n):
        return ({"operations": [1, 2]} for _ in range(n))

    delayed_records = delayed(lazy_records, pure=False)
    bag = db.from_delayed([delayed_records(5) for _ in range(5)])
    assert (db.compute(
        bag.count(),
        bag.pluck("operations").count(),
        bag.pluck("operations").flatten().count(),
        scheduler="sync",
    ) == (25, 25, 50))
Example #5
def test_from_delayed_iterator():
    from dask.delayed import delayed

    def lazy_records(n):
        return ({'operations': [1, 2]} for _ in range(n))

    delayed_records = delayed(lazy_records, pure=False)
    bag = db.from_delayed([delayed_records(5) for _ in range(5)])
    assert db.compute(
        bag.count(),
        bag.pluck('operations').count(),
        bag.pluck('operations').flatten().count(),
        get=dask.get,
    ) == (25, 25, 50)
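The three tests above differ mainly in scheduler arguments (the older ``get=`` versus the newer ``scheduler=``) and in concat/flatten naming; the counting logic itself can be sketched eagerly with a plain bag:

import dask.bag as db

b = db.from_sequence([{'operations': [1, 2]}] * 5, npartitions=1)
assert b.pluck('operations').count().compute() == 5
assert b.pluck('operations').flatten().count().compute() == 10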
Example #6
    def load(self, dataset, subset, key, count, start):
        import binascii
        rpc_connection = dask.delayed(ycore.create_rpc_conn)(dataset)
        if count == -1:
            count = dask.delayed(rpc_connection.liststreams)(subset)[
                0]['items']
        else:
            count = dask.delayed(count)
        start = dask.delayed(start)

        xset = dask.delayed(rpc_connection.liststreamkeyitems)(
            subset, key, False, count, start)
        xbag = db.from_delayed(xset).pluck('data').map(binascii.unhexlify)
        # xbag = xbag.map(binascii.unhexlify)
        return xbag
Example #7
def to_dask(source):
    chunksize = 100000  # FIXME: Where should this come from?

    futures = [read_partition(source, i) for i in range(source.npartitions)]

    if source.container == 'ndarray':
        # array_parts = [da.from_delayed(f, shape=c.shape, dtype=c.dtype) for f, c in zip(futures, chunks)]
        # return da.concatenate(array_parts, axis=0)
        raise ValueError('FIXME: Support ndarray concatenation')
    elif source.container == 'dataframe':
        return dd.from_delayed(futures)
    elif source.container == 'list':
        return db.from_delayed(futures)
    else:
        raise ValueError('Unknown container type: %s' % source.container)
Example #8
def to_dask(source):

    futures = [read_partition(source, i) for i in range(source.npartitions)]

    if source.container == 'ndarray':
        # array_parts = [da.from_delayed(f, shape=c.shape, dtype=c.dtype) for f, c in zip(futures, chunks)]
        # return da.concatenate(array_parts, axis=0)
        raise ValueError('FIXME: Support ndarray concatenation')
    elif source.container == 'dataframe':
        import dask.dataframe as dd
        return dd.from_delayed(futures)
    elif source.container == 'list':
        import dask.bag as db
        return db.from_delayed(futures)
    else:
        raise ValueError('Unknown container type: %s' % source.container)
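A rough sketch of the list-container branch above, with a hypothetical stand-in for read_partition:

import dask
import dask.bag as db

@dask.delayed
def fake_read_partition(i):
    # Hypothetical partition loader returning a list of records
    return [{'partition': i, 'value': i * 10}]

bag = db.from_delayed([fake_read_partition(i) for i in range(3)])
assert bag.count().compute() == 3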
Example #9
def main(files, client):

    bag = db.from_delayed([delayed(parse_single_file)(fn) for fn in files])
    df = bag.to_dataframe(columns=columns)

    # Nonstandard and inconsistent date formats in input.
    # These two lines standardize to ISO.
    df['endtime'] = df['endtime'].astype(np.datetime64)

    df['cumul_entries'] = df.cumul_entries.astype(np.int64)
    df['cumul_exits'] = df.cumul_exits.astype(np.int64)

    df = df.repartition(npartitions=16)

    df.to_parquet(os.path.join(config['parquet_output_path'], 'subway.parquet'),
                  compression='SNAPPY', object_encoding='json'
                  )
Example #10
def extract_entities_and_predicates_from_sentences(
    sentence_records: dbag.Bag,
    semrep_install_dir: Path,
    unicode_to_ascii_jar_path: Path,
    work_dir: Path,
    lexicon_year: int,
    mm_data_year: str,
    mm_data_version: str,
) -> dbag.Bag:
  """Runs each sentence through SemRep. Identifies Predicates and Entities

  Requires get_metamap_server_initializer added to dask_process_global.

  Args:
    sentence_records: Each record needs `id` and `sent_text`.
    work_dir: A directory visible to all workers where SemRep intermediate files
      will be stored.
    semrep_install_dir: The path where semrep was installed.

  Returns:
    One record per input sentence, where `id` of the new record matches the
    input. However, returned records will only have `entities` and `predicates`.

  """

  work_dir = Path(work_dir)
  assert work_dir.is_dir(), f"Failed to find shared work_dir: {work_dir}"
  semrep_input_dir = work_dir.joinpath("input_files")
  semrep_output_dir = work_dir.joinpath("output_files")
  semrep_input_dir.mkdir(exist_ok=True, parents=True)
  semrep_output_dir.mkdir(exist_ok=True, parents=True)

  semrep_tasks = []
  for part_idx, partition in enumerate(sentence_records.to_delayed()):
    semrep_input_path = semrep_input_dir.joinpath(f"input_{part_idx}.txt")
    # semrep_output_path = semrep_output_dir.joinpath(f"ouput_{part_idx}.xml")
    semrep_tasks.append(dask.delayed(_sentence_partition_to_records)(
        records=partition,
        unicode_to_ascii_jar_path=unicode_to_ascii_jar_path,
        input_path=semrep_input_path,
        semrep_install_dir=semrep_install_dir,
        lexicon_year=lexicon_year,
        mm_data_year=mm_data_year,
        mm_data_version=mm_data_version,
    ))
  return dbag.from_delayed(semrep_tasks)
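The per-partition pattern above (bag, then to_delayed, then wrap each partition, then from_delayed) can be sketched with a trivial transform in place of _sentence_partition_to_records:

import dask
import dask.bag as db

def shout(records):
    # Stand-in for the real per-partition worker
    return [str(r).upper() for r in records]

b = db.from_sequence(range(6), npartitions=3)
tasks = [dask.delayed(shout)(part) for part in b.to_delayed()]
b2 = db.from_delayed(tasks)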
Example #11
def main(files, client):

    bag = db.from_delayed([delayed(parse_single_file)(fn) for fn in files])
    df = bag.to_dataframe(columns=columns)

    # Nonstandard and inconsistent date formats in input.
    # These two lines standardize to ISO.
    df['endtime'] = df['endtime'].astype(np.datetime64)

    df['cumul_entries'] = df.cumul_entries.astype(np.int64)
    df['cumul_exits'] = df.cumul_exits.astype(np.int64)

    df = df.repartition(npartitions=16)

    df.to_parquet(os.path.join(config['parquet_output_path'],
                               'subway.parquet'),
                  compression='SNAPPY',
                  object_encoding='json')
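The bag-to-dataframe step in both main() variants follows this shape (toy records; real column handling depends on the parser output):

import dask.bag as db

records = db.from_sequence([
    {'endtime': '2015-06-18T00:00:00', 'cumul_entries': 1, 'cumul_exits': 2},
])
df = records.to_dataframe()   # columns inferred from the dict keys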
Example #12
def main():
    fp = '/project/euro4_hindcast/WIND-ATLAS_EURO4-RERUN/2015/06/18Z/'
    # fn = 'EURO4_2015060[1-3].pp'
    fn = '*.pp'
    dlyd = load_subset(fp, fn)
    cs = db.from_delayed(dlyd)
    cubes = iris.cube.CubeList(cs.compute())
    # The x- and y-wind cubes are on different domains. This notwithstanding,
    # the x-wind cube also has one more latitude point than the y-wind cube,
    # which we arbitrarily chop off.
    x_wind_cube = cubes[0][..., :-1, :]
    y_wind_cube = cubes[1]

    wspd_cube, theta_cube = xy_to_wspd_and_dir(x_wind_cube, y_wind_cube)

    wspd_var_cube, wdir_var_cube = mln_variance(wspd_cube, theta_cube)
    print(wspd_var_cube)
    print(wdir_var_cube)
    wspd_var_data = wspd_var_cube.data
    wdir_var_data = wdir_var_cube.data
Example #13
def get_comments(pages: int) -> DataFrame:
    """Get List of comments per page"""
    comments_delayed = (get_elements_json(url=COMMENTS_ENDPOINT,
                                          page=str(page),
                                          file="comments.dat")
                        for page in range(1, pages + 1))

    comments_bag = db.from_delayed(comments_delayed).map(
        lambda d: {
            "id": d["id"],
            "post": d["post"],
            "author_name": d["author_name"],
            "date": create_datetime(d["date"]),
            "content": d["content"]["rendered"],
            "link": d["link"],
        })
    # On Windows the Dask Bag is multiprocessing by default, so change to threads
    with dask.config.set(scheduler="threads"):
        comments: List[dict] = comments_bag.compute()
    df_comments = DataFrame(comments)
    return df_comments
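The scheduler override at the end works with any bag computation, e.g.:

import dask
import dask.bag as db

b = db.from_sequence(range(4))
with dask.config.set(scheduler="threads"):
    result = b.map(lambda x: x + 1).compute()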
Example #14
    def read_avro(self, file_path, blocksize=1048576):
        """
        Read the downloaded query into avro

        Args:
            file_path (str):
                Path to the downloaded avro file

            blocksize (int):
                Size of blocks. Note that this size must be larger than the
                files' internal block size, otherwise it will result in
                empty partitions. Default is 1MB

        :rtype: dask.delayed
        :returns: a dask delayed object

        """
        from dask_avro import read_avro
        from dask.bag import from_delayed

        delayed_avro = read_avro(file_path, blocksize=blocksize)
        return from_delayed(delayed_avro)
Example #15
def cull_empty_partitions(bag):
    """When bags are created by filtering or grouping from a different bag,
    it retains the original bag's partition count, even if a lot of the
    partitions become empty.
    Those extra partitions add overhead, so it's nice to discard them.
    This function drops the empty partitions.

    Parameters
    ----------
    bag: dask.bag

    Returns
    -------
    partitions: dask.bag
    """
    bag = bag.persist()

    def get_len(partition):
        # If the bag is the result of bag.filter(),
        # then each partition is actually a 'filter' object,
        # which has no __len__.
        # In that case, we must convert it to a list first.
        if hasattr(partition, '__len__'):
            return len(partition)
        return len(list(partition))

    partition_lengths = bag.map_partitions(get_len).compute()

    # Convert bag partitions into a list of 'delayed' objects
    lengths_and_partitions = zip(partition_lengths, bag.to_delayed())

    # Drop the ones with empty partitions
    partitions = (p for l, p in lengths_and_partitions if l > 0)

    # Build a new bag from the non-empty delayed partitions
    return db.from_delayed(partitions)
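A quick usage sketch, assuming the helper above is in scope:

import dask.bag as db

b = db.from_sequence(range(10), npartitions=5).filter(lambda x: x < 2)
trimmed = cull_empty_partitions(b)
print(b.npartitions, trimmed.npartitions)   # e.g. 5 vs. 1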
Example #16
 def to_dask(self):
     import dask.bag as db
     from dask import delayed
     dfile = delayed(get_file)
     return db.from_delayed([dfile(f) for f in self._files])
Example #17
def get_medline_documents(config: cpb.ConstructConfig, ) -> dbag.Bag:
    medline_dir = Path(config.medline_xml_dir)
    medline_dir.mkdir(parents=True, exist_ok=True)
    assert medline_dir.is_dir(), f"Failed to make {config.medline_xml_dir}"

    # Download all of pubmed. ####
    if not config.skip_ftp_download:
        print("Downloading pubmed XML Files")
        with ftp_util.ftp_connect(
                address=config.ftp.address,
                workdir=config.ftp.workdir,
        ) as conn:
            # Downloads new files if not already present in shared
            xml_paths = ftp_util.ftp_retreive_all(
                conn=conn,
                pattern="^.*\.xml\.gz$",
                directory=medline_dir,
                show_progress=True,
            )
        if config.ftp.include_daily_updates:
            print("Downloading daily updates...")
            with ftp_util.ftp_connect(
                    address=config.ftp.address,
                    workdir=config.ftp.workdir_daily,
            ) as conn:
                # Downloads new files if not already present in shared
                xml_paths_daily = ftp_util.ftp_retreive_all(
                    conn=conn,
                    pattern="^.*\.xml\.gz$",
                    directory=medline_dir,
                    show_progress=True,
                )
                xml_paths += xml_paths_daily
    else:
        print(f"Skipping FTP download, using {medline_dir}/*.xml.gz instead")
        assert medline_dir.is_dir(), f"Cannot find {medline_dir}"
        xml_paths = list(medline_dir.glob("*.xml.gz"))
        assert len(xml_paths) > 0, f"No .xml.gz files inside {medline_dir}"

    if config.debug.enable:
        print(f"\t- Downsampling {len(xml_paths)} xml files to only "
              f"{config.debug.partition_subset_size}.")
        # Takes the top x (typically larger)
        xml_paths = xml_paths[-config.debug.partition_subset_size:]

    # Parse xml-files per-partition
    medline_documents = dbag.from_delayed([
        dask.delayed(parse_pubmed_xml.parse_zipped_pubmed_xml)(xml_path=p, )
        for p in xml_paths
    ])

    if not config.allow_nonenglish_abstracts:
        medline_documents = medline_documents.filter(
            # Only take the english ones
            lambda r: r["language"] == "eng")

    if config.HasField("cut_date"):
        # This will fail if the cut-date is not a valid string
        datetime.strptime(config.cut_date, "%Y-%m-%d")
        medline_documents = medline_documents.filter(
            lambda r: r["date"] < config.cut_date)

    if config.debug.enable:
        print("\t- Downsampling documents by "
              f"{config.debug.document_sample_rate}")
        medline_documents = medline_documents.random_sample(
            config.debug.document_sample_rate, )
    return medline_documents
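The filtering and downsampling steps at the end follow the standard Bag API; a toy version:

import dask.bag as db

docs = db.from_sequence([
    {'language': 'eng', 'date': '2019-01-01'},
    {'language': 'fre', 'date': '2021-05-01'},
])
docs = docs.filter(lambda r: r['language'] == 'eng')
docs = docs.random_sample(0.5)   # keep each record with probability 0.5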
Example #18
 def to_dask(self):
     """Return a dask-bag of results"""
     import dask.bag as db
     import dask.delayed
     dpart = dask.delayed(self._get_partition)
     return db.from_delayed([dpart(i) for i in range(self.npartitions)])
Example #19
def test_futures_to_delayed_bag(c):
    L = [1, 2, 3]

    futures = c.scatter([L, L])
    b = db.from_delayed(futures)
    assert list(b) == L + L
Example #20
    def execute(self):
        self._init_service()
        mgr_client = self.mgr_client

        options = self.config["stitchedmeshes"]

        server, uuid, instance = self.input_service.base_service.instance_triple
        is_supervoxels = self.input_service.base_service.supervoxels
        bodies = load_body_list(options["bodies"], is_supervoxels)

        logger.info(f"Input is {len(bodies)} bodies")
        os.makedirs(options["output-directory"], exist_ok=True)

        def make_bricks(coord_and_block):
            coord_zyx, block_vol = coord_and_block
            logical_box = np.array((coord_zyx, coord_zyx + block_vol.shape))
            return Brick(logical_box,
                         logical_box,
                         block_vol,
                         location_id=(logical_box // 64))

        rescale = (2**options["scale"]) * options["extra-rescale"]

        def create_brick_mesh(brick):
            mesh = Mesh.from_binary_vol(brick.volume, brick.physical_box)
            if rescale != 1.0:
                mesh.vertices_zyx *= rescale
            return mesh

        def create_combined_mesh(meshes):
            mesh = concatenate_meshes(meshes, False)
            if options["stitch"]:
                mesh.stitch_adjacent_faces(drop_unused_vertices=True,
                                           drop_duplicate_faces=True)
            mesh.laplacian_smooth(options["smoothing-iterations"])
            mesh.simplify(options["decimation-fraction"], in_memory=True)
            return mesh

        in_flight = 0

        # Support synchronous testing with a fake 'as_completed' object
        if hasattr(self.client, 'DEBUG'):
            result_futures = as_completed_synchronous()
        else:
            result_futures = as_completed()

        def pop_result():
            nonlocal in_flight
            r = next(result_futures)
            in_flight -= 1

            try:
                return r.result()
            except Exception as ex:
                if options["error-mode"] == "raise":
                    raise
                body = int(r.key)
                return (body, 0, 'error', str(ex))

        USER = getpass.getuser()
        results = []
        try:
            for i, body in enumerate(bodies):
                logger.info(f"Mesh #{i}: Body {body}: Starting")

                def fetch_sparsevol():
                    with mgr_client.access_context(server, True, 1, 0):
                        ns = default_node_service(server, uuid,
                                                  'flyemflows-stitchedmeshes',
                                                  USER)
                        coords_zyx, blocks = ns.get_sparselabelmask(
                            body, instance, options["scale"], is_supervoxels)
                        return list(coords_zyx.copy()), list(blocks.copy())

                # This leaves all blocks and bricks in a single partition,
                # but we're about to do a shuffle anyway when the bricks are realigned.
                coords, blocks = delayed(fetch_sparsevol, nout=2)()
                coords, blocks = db.from_delayed(coords), db.from_delayed(
                    blocks)
                bricks = db.zip(coords, blocks).map(make_bricks)

                mesh_grid = Grid((64, 64, 64), halo=options["block-halo"])
                wall = BrickWall(None, (64, 64, 64), bricks)
                wall = wall.realign_to_new_grid(mesh_grid)

                brick_meshes = wall.bricks.map(create_brick_mesh)
                consolidated_brick_meshes = brick_meshes.repartition(1)
                combined_mesh = delayed(create_combined_mesh)(
                    consolidated_brick_meshes)

                def write_mesh(mesh):
                    output_dir = options["output-directory"]
                    fmt = options["format"]
                    output_path = f'{output_dir}/{body}.{fmt}'
                    mesh.serialize(output_path)
                    return (body, len(mesh.vertices_zyx), 'success', '')

                # We hide the body ID in the task name, so that we can record it in pop_result
                task = delayed(write_mesh)(combined_mesh,
                                           dask_key_name=f'{body}')
                result_futures.add(self.client.compute(task))
                in_flight += 1

                assert in_flight <= options["concurrent-bodies"]
                while in_flight == options["concurrent-bodies"]:
                    body, vertices, result, msg = pop_result()
                    if result == "error":
                        logger.warning(
                            f"Body {body}: Failed to generate mesh: {msg}")
                    results.append((body, vertices, result, msg))

            # Flush the last batch of tasks
            while in_flight > 0:
                body, vertices, result, msg = pop_result()
                if result == "error":
                    logger.warning(
                        f"Body {body}: Failed to generate mesh: {msg}")
                results.append((body, vertices, result, msg))
        finally:
            stats_df = pd.DataFrame(
                results, columns=['body', 'vertices', 'result', 'msg'])
            stats_df.to_csv('mesh-stats.csv', index=False, header=True)

            failed_df = stats_df.query("result != 'success'")
            if len(failed_df) > 0:
                logger.warning(
                    f"Failed to create meshes for {len(failed_df)} bodies.  See mesh-stats.csv"
                )
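The two-output delayed pattern used for fetch_sparsevol above can be reduced to this sketch:

import dask
import dask.bag as db

def two_lists():
    return [1, 2, 3], ['a', 'b', 'c']

xs, ys = dask.delayed(two_lists, nout=2)()
pairs = db.zip(db.from_delayed(xs), db.from_delayed(ys))
assert pairs.compute() == [(1, 'a'), (2, 'b'), (3, 'c')]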
Example #21
    except requests.exceptions.InvalidSchema:
        return []  # Skip non-web links


dask.compute(crawl("http://holdenkarau.com/"))
#end::mini_crawl_task[]

# In[ ]:

#tag::make_bag_of_crawler[]
import dask.bag as db
githubs = [
    "https://github.com/scalingpythonml/scalingpythonml",
    "https://github.com/dask/distributed"
]
initial_bag = db.from_delayed(map(crawl, githubs))
#end::make_bag_of_crawler[]

# In[ ]:

#tag::make_a_bag_of_words[]
words_bag = initial_bag.map(
    lambda url_contents: url_contents[1].split(" ")).flatten()
#end::make_a_bag_of_words[]

# In[ ]:

#tag::wc_freq[]
dask.compute(words_bag.frequencies())

#end::wc_freq[]
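The word-count step can also be finished with Bag reductions directly, e.g.:

import dask.bag as db

words = db.from_sequence(["dask", "bag", "dask"])
top = words.frequencies().topk(1, key=lambda kv: kv[1]).compute()   # [('dask', 2)]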
Example #22
def read_avro(urlpath, blocksize=100000000, storage_options=None,
              compression=None):
    """Read set of avro files

    Use this with arbitrary nested avro schemas. Please refer to the
    fastavro documentation for its capabilities:
    https://github.com/fastavro/fastavro

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    blocksize: int or None
        Size of chunks in bytes. If None, there will be no chunking and each
        file will become one partition.
    storage_options: dict or None
        passed to backend file-system
    compression: str or None
        Compression format of the target(s), like 'gzip'. Should only be used
        with blocksize=None.
    """
    from dask.utils import import_required
    from dask import delayed, compute
    from dask.bytes.core import (open_files, get_fs_token_paths,
                                 OpenFile, tokenize)
    from dask.bag import from_delayed
    import_required('fastavro',
                    "fastavro is a required dependency for using "
                    "bag.read_avro().")

    storage_options = storage_options or {}
    if blocksize is not None:
        fs, fs_token, paths = get_fs_token_paths(
            urlpath, mode='rb', storage_options=storage_options)
        dhead = delayed(open_head)
        out = compute(*[dhead(fs, path, compression) for path in paths])
        heads, sizes = zip(*out)
        dread = delayed(read_chunk)

        offsets = []
        lengths = []
        for size in sizes:
            off = list(range(0, size, blocksize))
            length = [blocksize] * len(off)
            offsets.append(off)
            lengths.append(length)

        out = []
        for path, offset, length, head in zip(paths, offsets, lengths, heads):
            delimiter = head['sync']
            f = OpenFile(fs, path, compression=compression)
            token = tokenize(fs_token, delimiter, path, fs.ukey(path),
                             compression, offset)
            keys = ['read-avro-%s-%s' % (o, token) for o in offset]
            values = [dread(f, o, l, head, dask_key_name=key)
                      for o, key, l in zip(offset, keys, length)]
            out.extend(values)

        return from_delayed(out)
    else:
        files = open_files(urlpath, compression=compression, **storage_options)
        dread = delayed(read_file)
        chunks = [dread(fo) for fo in files]
        return from_delayed(chunks)
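A hedged usage sketch for the compression path above (the path is hypothetical; gzip-compressed files cannot be chunked, hence blocksize=None):

import dask.bag as db

bag = db.read_avro('logs/*.avro.gz', blocksize=None, compression='gzip')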
Example #23
 def to_dask(self):
     import dask.bag as db
     import dask
     return db.from_delayed([dask.delayed(self._get_partition)(i)
                             for i in range(self.npartitions)])
Example #24
 def to_dask(self):
     import dask.delayed
     import dask.bag as db
     dpart = dask.delayed(read_stream)
     parts = [dpart(stream) for stream in self._streams]
     return db.from_delayed(parts)
Example #25
def get_default_rules():
    """
    Return the list of default rules used to interpret expressions in
    :func:`apply()<flowly.dsk.apply>`.

    Each rule has an additional property ``.name`` that may be useful when
    modifying the rules.
    """
    return [
        adict(
            name='builtins.sum',
            match=_match_equal(builtins.sum),
            apply=_methodcaller('sum'),
        ),
        adict(
            name='builtins.all',
            match=_match_equal(builtins.all),
            apply=_methodcaller('all'),
        ),
        adict(
            name='builtins.any',
            match=_match_equal(builtins.any),
            apply=_methodcaller('any'),
        ),
        adict(
            name='builtins.len',
            match=_match_equal(builtins.len),
            apply=_methodcaller('count'),
        ),
        adict(
            name='builtins.max',
            match=_match_equal(builtins.max),
            apply=_methodcaller('max'),
        ),
        adict(
            name='builtins.min',
            match=_match_equal(builtins.min),
            apply=_methodcaller('min'),
        ),
        adict(
            name='toolz.concat',
            match=_match_equal(toolz.concat),
            apply=lambda bag, transform, rules: bag.concat(),
        ),
        adict(
            name='toolz.compose',
            match=_match_isinstance(toolz.functoolz.Compose),
            apply=_apply__toolz__compose,
        ),
        adict(
            name='toolz.count',
            match=_match_equal(count),
            apply=_methodcaller('count'),
        ),
        adict(
            name='toolz.frequencies',
            match=_match_equal(frequencies),
            apply=_methodcaller('frequencies'),
        ),
        adict(name='toolz.curried.filter',
              match=_match_curried(builtins.filter),
              apply=lambda bag, transform, rules: bag.filter(
                  *transform.args, **transform.keywords)),
        adict(
            name='toolz.curried.map',
            match=_match_curried(builtins.map),
            apply=lambda bag, transform, rules: bag.map(
                *transform.args, **transform.keywords),
        ),
        adict(
            name='toolz.curried.mapcat',
            match=_match_curried(toolz.mapcat),
            apply=lambda bag, transform, rules: bag.map(*transform.args).
            concat(),
        ),
        adict(
            name='toolz.curried.pluck',
            match=_match_curried(toolz.pluck),
            apply=lambda bag, transform, rules: bag.pluck(
                *transform.args, **transform.keywords),
        ),
        adict(
            name='toolz.curried.random_sample',
            match=_match_curried(toolz.random_sample),
            apply=lambda bag, transform, rules: bag.random_sample(
                *transform.args, **transform.keywords),
        ),
        adict(name='toolz.curried.reduce',
              match=_match_curried(ft.reduce),
              apply=lambda bag, transform, rules: bag.reduction(
                  lambda i: i,
                  lambda partitions: ft.reduce(
                      transform.args[0], it.chain.from_iterable(partitions)),
              )),
        adict(
            name='toolz.curried.remove',
            match=_match_curried(toolz.remove),
            apply=lambda bag, transform, rules: bag.remove(
                *transform.args, **transform.keywords),
        ),
        adict(
            name='toolz.curried.take',
            match=_match_curried(toolz.take),
            apply=lambda bag, transform, rules: bag.take(
                *transform.args,
                **merge(transform.keywords, dict(compute=False, npartitions=-1)
                        )),
        ),
        adict(
            name='toolz.curried.topk',
            match=_match_curried(toolz.topk),
            apply=lambda bag, transform, rules: bag.topk(
                *transform.args, **transform.keywords),
        ),
        adict(name='toolz.curried.groupby',
              match=_match_curried(toolz.groupby),
              apply=lambda *args, **kwargs: raise_(ValueError,
                                                   'use flowly.tz.groupby')),
        adict(name='toolz.curried.reduceby',
              match=_match_curried(toolz.reduceby),
              apply=lambda *args, **kwargs: raise_(ValueError,
                                                   'use flowly.tz.reduceby')),
        adict(
            name='toolz.unique',
            match=_match_equal(toolz.unique),
            apply=_methodcaller('distinct'),
        ),
        adict(
            name='itertools.chain.from_iterable',
            match=_match_equal(it.chain.from_iterable),
            apply=lambda bag, transform, rules: bag.concat(),
        ),
        adict(
            name='flowly.checkpoint.with_checkpoint',
            match=_match_isinstance(with_checkpoint),
            apply=_apply__checkpoint__with_checkpoint,
        ),
        adict(
            name='flowly.tz.apply_concat',
            match=_match_isinstance(apply_concat),
            apply=_apply__flowly__tz__apply_concat,
        ),
        adict(
            name='flowly.tz.apply_map_concat',
            match=_match_isinstance(apply_map_concat),
            apply=_apply__flowly__tz__apply_map_concat,
        ),
        adict(
            name='flowly.tz.build_dict',
            match=_match_isinstance(build_dict),
            apply=_build_dask_dict,
        ),
        adict(
            name='flowly.tz.itemsetter',
            match=_match_isinstance(itemsetter),
            apply=_update_dask_dict,
        ),
        adict(
            name='flowly.tz.chained',
            match=_match_isinstance(chained),
            apply=_apply__flowly__tz__chained,
        ),
        adict(
            name='flowly.tz.groupby',
            match=_match_isinstance(groupby),
            # TODO: inject remaining arguments into groupby
            apply=lambda bag, transform, rules: bag.groupby(transform.key),
        ),
        adict(
            name='flowly.tz.kv_keymap',
            match=_match_isinstance(kv_keymap),
            apply=lambda bag, transform, rules: apply(
                toolz.curried.map(lambda t: (transform.func(t[0]), t[1])),
                bag,
                rules=rules,
            ),
        ),
        adict(
            name='flowly.tz.kv_valmap',
            match=_match_isinstance(kv_valmap),
            apply=lambda bag, transform, rules: apply(
                toolz.curried.map(lambda t: (t[0], transform.func(t[1]))),
                bag,
                rules=rules,
            ),
        ),
        adict(
            name='flowly.tz.kv_reduceby',
            match=_match_isinstance(kv_reduceby),
            apply=_apply_kv_reduceby,
        ),
        adict(
            name='flowly.tz.kv_reductionby',
            match=_match_isinstance(kv_reductionby),
            apply=_apply_kv_reductionby,
        ),
        adict(
            name='flowly.tz.reduceby',
            match=_match_isinstance(reduceby),
            apply=lambda bag, transform, rules:
            (bag.groupby(transform.key).map(lambda t: (t[
                0], ft.reduce(transform.binop, t[1])))),
        ),
        adict(
            name='flowly.tz.reduction',
            match=_match_isinstance(reduction),
            apply=lambda bag, transform, rules: bag.reduction(
                transform.perpartition,
                transform.aggregate,
                split_every=transform.split_every,
            ),
        ),
        adict(
            name='flowly.tz.reductionby',
            match=_match_isinstance(reductionby),
            apply=_apply_reductionby,
        ),
        adict(name='flowly.tz.seq',
              match=_match_equal(seq),
              apply=lambda item, transform, rules: db.from_delayed(
                  [item.apply(lambda i: [i]).to_delayed()])),
        # TODO: let any curried callable fall back to the callable itself, if no args were given
        # TODO: add option to skip arbitrary callables and add marker functions to annotate them
        adict(
            name='callable',
            match=lambda bag, transform, rules: callable(transform),
            apply=lambda bag, transform, rules: transform(bag),
        )
    ]
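As a rough illustration of how the curried rules dispatch (toolz curry objects expose .args and .keywords, which the rules forward to the corresponding Bag methods):

import toolz
import dask.bag as db

b = db.from_sequence([1, 2, 3])
transform = toolz.curried.map(lambda x: x + 1)          # curried callable
mapped = b.map(*transform.args, **transform.keywords)   # what the map rule does
assert mapped.compute() == [2, 3, 4]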
Example #26
async def run():
    number_of_cores_per_node = 16  # DAS-5 features 2x8 NUMA cores per compute node
    reservation_length = "08:00:00"  # 2 hours is more than enough... probably
    cluster = SLURMCluster(cores=number_of_cores_per_node,
                           memory="64 GB",
                           processes=4,
                           scheduler_options={"dashboard_address": ":6868"},
                           local_directory="./aip-logs",
                           interface='ib0',
                           walltime=reservation_length)

    # Grab 5 execution nodes -> 80 cores
    print("Scaling up, getting 5 nodes")
    cluster.scale_up(5)
    client = Client(cluster)

    print("Client is ready, parsing data files...")

    file_locations = "/var/scratch/lvs215/aip_tmp"
    data_files = []

    # Create a list of all the files we want to parse. Skip the compressed sources if they are still lingering around
    for path, subdirs, files in os.walk(file_locations):
        for name in files:
            if isfile(os.path.join(path, name)) and not name.endswith(
                ("gz", "zip", "tar")):
                data_files.append(os.path.join(path, name))

    client.run(clear_all_files)

    # Create one task per file.
    print(data_files)
    print("Creating and executing tasks...")
    tasks = list(map(delayed(process_file), data_files))
    true_false_array = db.from_delayed(tasks)

    # DEBUG CODE
    # future = client.compute(true_false_array)
    # client.recreate_error_locally(future)

    # Time to compute them!
    start = datetime.datetime.now()
    res = true_false_array.compute()
    end = datetime.datetime.now()
    print(true_false_array)
    print(res)
    print("Tasks ran to completion! Copying databases.")
    if False not in res:  # If everything went alright, let all nodes copy their databases to the home dir.
        client.run(copy_database_to_home_folder)
        client.run(clear_all_files)
    else:
        print("Parsing one of the files went horribly wrong, quitting!")
        exit(-1)

    print("Beginning assembling of all databases into one!")
    # Now, each of the nodes has a local database file, we will now combine these databases into one.
    # We do this process sequentially, because we are not sure yet if SQLite likes it if all nodes do this in parallel.
    # TODO: test if we can do this procedure in each node through the copy_database_to_home_folder, would save copying data
    database_manager = DatabaseManager(
    )  # This creates an empty aip.db if it doesn't exist.
    con3 = database_manager.db  # Reuse the connection

    # based on https://stackoverflow.com/a/37138506
    os.makedirs(db_files_location, exist_ok=True)
    for file in [
            os.path.join(db_files_location, f)
            for f in os.listdir(db_files_location)
            if isfile(os.path.join(db_files_location, f)) and f.endswith(".db")
    ]:
        con3.execute("ATTACH '{}' as dba".format(file))

        con3.execute("BEGIN")
        for row in con3.execute(
                "SELECT * FROM dba.sqlite_master WHERE type='table'"):
            combine = "INSERT INTO " + row[1] + " SELECT * FROM dba." + row[1]
            print(combine)
            con3.execute(combine)
        con3.execute("detach database dba")
        con3.commit()
        # Now, delete the database as it has been copied.
        # os.remove("{}.db".format(hash(worker)))
    print("All done. Releasing all nodes.")
    await cluster.scale_down(cluster.workers)
    print("Nodes released.")
    print(end - start)
Example #27
 def to_dask(parts, dtype):
     import dask.bag as db
     return db.from_delayed(parts)
Example #28
 def to_dask(parts, dtype):
     import dask.bag as db
     return db.from_delayed(parts)
Example #29
def dataset_to_bag(dataset):
    return db.from_delayed(
        [delayed(lambda x: [x])(chunk) for chunk in split_by_chunks(dataset)])
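The lambda-wrapping above turns each chunk into a one-element partition; a toy equivalent:

import dask
import dask.bag as db

chunks = ["chunk-a", "chunk-b"]   # stand-ins for the dataset chunks
bag = db.from_delayed([dask.delayed(lambda x: [x])(c) for c in chunks])
assert bag.count().compute() == 2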
Example #30
def read_avro(urlpath,
              blocksize=100000000,
              storage_options=None,
              compression=None):
    """Read set of avro files

    Use this with arbitrary nested avro schemas. Please refer to the
    fastavro documentation for its capabilities:
    https://github.com/fastavro/fastavro

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    blocksize: int or None
        Size of chunks in bytes. If None, there will be no chunking and each
        file will become one partition.
    storage_options: dict or None
        passed to backend file-system
    compression: str or None
        Compression format of the target(s), like 'gzip'. Should only be used
        with blocksize=None.
    """
    from dask.utils import import_required
    from dask import delayed, compute
    from dask.bytes.core import (open_files, get_fs_token_paths, OpenFile,
                                 tokenize)
    from dask.bag import from_delayed
    import_required(
        'fastavro', "fastavro is a required dependency for using "
        "bag.read_avro().")

    storage_options = storage_options or {}
    if blocksize is not None:
        fs, fs_token, paths = get_fs_token_paths(
            urlpath, mode='rb', storage_options=storage_options)
        dhead = delayed(open_head)
        out = compute(*[dhead(fs, path, compression) for path in paths])
        heads, sizes = zip(*out)
        dread = delayed(read_chunk)

        offsets = []
        lengths = []
        for size in sizes:
            off = list(range(0, size, blocksize))
            length = [blocksize] * len(off)
            offsets.append(off)
            lengths.append(length)

        out = []
        for path, offset, length, head in zip(paths, offsets, lengths, heads):
            delimiter = head['sync']
            f = OpenFile(fs, path, compression=compression)
            token = tokenize(fs_token, delimiter, path, fs.ukey(path),
                             compression, offset)
            keys = ['read-avro-%s-%s' % (o, token) for o in offset]
            values = [
                dread(f, o, l, head, dask_key_name=key)
                for o, key, l in zip(offset, keys, length)
            ]
            out.extend(values)

        return from_delayed(out)
    else:
        files = open_files(urlpath, compression=compression, **storage_options)
        dread = delayed(read_file)
        chunks = [dread(fo) for fo in files]
        return from_delayed(chunks)
Example #31
File: dsk.py  Project: chmp/flowly
def get_default_rules():
    """
    Return the list of default rules used to interpret expressions in
    :func:`apply()<flowly.dsk.apply>`.

    Each rule has an additional property ``.name`` that may be useful when
    modifying the rules.
    """
    return [
        adict(
            name='builtins.sum', match=_match_equal(builtins.sum), apply=_methodcaller('sum'),
        ),
        adict(
            name='builtins.all', match=_match_equal(builtins.all), apply=_methodcaller('all'),
        ),
        adict(
            name='builtins.any', match=_match_equal(builtins.any), apply=_methodcaller('any'),
        ),
        adict(
            name='builtins.len', match=_match_equal(builtins.len), apply=_methodcaller('count'),
        ),
        adict(
            name='builtins.max', match=_match_equal(builtins.max), apply=_methodcaller('max'),
        ),
        adict(
            name='builtins.min', match=_match_equal(builtins.min), apply=_methodcaller('min'),
        ),
        adict(
            name='toolz.concat',
            match=_match_equal(toolz.concat),
            apply=lambda bag, transform, rules: bag.concat(),
        ),
        adict(
            name='toolz.compose',
            match=_match_isinstance(toolz.functoolz.Compose),
            apply=_apply__toolz__compose,
        ),
        adict(
            name='toolz.count', match=_match_equal(count), apply=_methodcaller('count'),
        ),
        adict(
            name='toolz.frequencies',
            match=_match_equal(frequencies),
            apply=_methodcaller('frequencies'),
        ),
        adict(
            name='toolz.curried.filter',
            match=_match_curried(builtins.filter),
            apply=lambda bag, transform, rules: bag.filter(*transform.args, **transform.keywords)
        ),
        adict(
            name='toolz.curried.map',
            match=_match_curried(builtins.map),
            apply=lambda bag, transform, rules: bag.map(*transform.args, **transform.keywords),
        ),
        adict(
            name='toolz.curried.mapcat',
            match=_match_curried(toolz.mapcat),
            apply=lambda bag, transform, rules: bag.map(*transform.args).concat(),
        ),
        adict(
            name='toolz.curried.pluck',
            match=_match_curried(toolz.pluck),
            apply=lambda bag, transform, rules: bag.pluck(*transform.args, **transform.keywords),
        ),
        adict(
            name='toolz.curried.random_sample',
            match=_match_curried(toolz.random_sample),
            apply=lambda bag, transform, rules: bag.random_sample(*transform.args, **transform.keywords),
        ),
        adict(
            name='toolz.curried.reduce',
            match=_match_curried(ft.reduce),
            apply=lambda bag, transform, rules: bag.reduction(
                lambda i: i,
                lambda partitions: ft.reduce(transform.args[0], it.chain.from_iterable(partitions)),
            )
        ),
        adict(
            name='toolz.curried.remove',
            match=_match_curried(toolz.remove),
            apply=lambda bag, transform, rules: bag.remove(*transform.args, **transform.keywords),
        ),
        adict(
            name='toolz.curried.take',
            match=_match_curried(toolz.take),
            apply=lambda bag, transform, rules: bag.take(
                *transform.args,
                **merge(transform.keywords, dict(compute=False, npartitions=-1))
            ),
        ),
        adict(
            name='toolz.curried.topk',
            match=_match_curried(toolz.topk),
            apply=lambda bag, transform, rules: bag.topk(*transform.args, **transform.keywords),
        ),
        adict(
            name='toolz.curried.groupby',
            match=_match_curried(toolz.groupby),
            apply=lambda *args, **kwargs: raise_(ValueError, 'use flowly.tz.groupby')
        ),
        adict(
            name='toolz.curried.reduceby',
            match=_match_curried(toolz.reduceby),
            apply=lambda *args, **kwargs: raise_(ValueError, 'use flowly.tz.reduceby')
        ),
        adict(
            name='toolz.unique',
            match=_match_equal(toolz.unique),
            apply=_methodcaller('distinct'),
        ),
        adict(
            name='itertools.chain.from_iterable',
            match=_match_equal(it.chain.from_iterable),
            apply=lambda bag, transform, rules: bag.concat(),
        ),
        adict(
            name='flowly.checkpoint.with_checkpoint',
            match=_match_isinstance(with_checkpoint),
            apply=_apply__checkpoint__with_checkpoint,
        ),
        adict(
            name='flowly.tz.apply_concat',
            match=_match_isinstance(apply_concat),
            apply=_apply__flowly__tz__apply_concat,
        ),
        adict(
            name='flowly.tz.apply_map_concat',
            match=_match_isinstance(apply_map_concat),
            apply=_apply__flowly__tz__apply_map_concat,
        ),
        adict(
            name='flowly.tz.build_dict',
            match=_match_isinstance(build_dict),
            apply=_build_dask_dict,
        ),
        adict(
            name='flowly.tz.itemsetter',
            match=_match_isinstance(itemsetter),
            apply=_update_dask_dict,
        ),
        adict(
            name='flowly.tz.chained',
            match=_match_isinstance(chained),
            apply=_apply__flowly__tz__chained,
        ),
        adict(
            name='flowly.tz.groupby',
            match=_match_isinstance(groupby),
            # TODO: inject remaining arguments into groupby
            apply=lambda bag, transform, rules: bag.groupby(transform.key),
        ),
        adict(
            name='flowly.tz.kv_keymap',
            match=_match_isinstance(kv_keymap),
            apply=lambda bag, transform, rules: apply(
                toolz.curried.map(lambda t: (transform.func(t[0]), t[1])), bag, rules=rules,
            ),
        ),
        adict(
            name='flowly.tz.kv_valmap',
            match=_match_isinstance(kv_valmap),
            apply=lambda bag, transform, rules: apply(
                toolz.curried.map(lambda t: (t[0], transform.func(t[1]))), bag, rules=rules,
            ),
        ),
        adict(
            name='flowly.tz.kv_reduceby',
            match=_match_isinstance(kv_reduceby),
            apply=_apply_kv_reduceby,
        ),
        adict(
            name='flowly.tz.kv_reductionby',
            match=_match_isinstance(kv_reductionby),
            apply=_apply_kv_reductionby,
        ),
        adict(
            name='flowly.tz.reduceby',
            match=_match_isinstance(reduceby),
            apply=lambda bag, transform, rules: (
                bag
                .groupby(transform.key)
                .map(lambda t: (t[0], ft.reduce(transform.binop, t[1])))
            ),
        ),
        adict(
            name='flowly.tz.reduction',
            match=_match_isinstance(reduction),
            apply=lambda bag, transform, rules: bag.reduction(
                transform.perpartition, transform.aggregate, split_every=transform.split_every,
            ),
        ),
        adict(
            name='flowly.tz.reductionby',
            match=_match_isinstance(reductionby),
            apply=_apply_reductionby,
        ),
        adict(
            name='flowly.tz.seq',
            match=_match_equal(seq),
            apply=lambda item, transform, rules: db.from_delayed([
                item.apply(lambda i: [i]).to_delayed()
            ])
        ),
        # TODO: let any curried callable fall back to the callable itself, if no args were given
        # TODO: add option to skip arbitrary callables and add marker functions to annotate them
        adict(
            name='callable',
            match=lambda bag, transform, rules: callable(transform),
            apply=lambda bag, transform, rules: transform(bag),
        )
    ]