Example #1
def test_concat_two_empty_series(ignore_index, axis):
    s1 = gd.Series()
    s2 = gd.Series()
    ps1 = s1.to_pandas()
    ps2 = s2.to_pandas()
    got = gd.concat([s1, s2], axis=axis, ignore_index=ignore_index)
    expect = pd.concat([ps1, ps2], axis=axis, ignore_index=ignore_index)

    assert_eq(got, expect)


@pytest.mark.parametrize(
    "df1,df2",
    [
        (
            gd.DataFrame({"k1": [0, 1], "k2": [2, 3], "v1": [4, 5]}),
            gd.DataFrame({"k1": [1, 0], "k2": [3, 2], "v2": [6, 7]}),
        ),
        (
            gd.DataFrame({"k1": [0, 1], "k2": [2, 3], "v1": [4, 5]}),
            gd.DataFrame({"k1": [0, 1], "k2": [3, 2], "v2": [6, 7]}),
        ),
    ],
)
def test_concat_dataframe_with_multiIndex(df1, df2):
    gdf1 = df1
    gdf1 = gdf1.set_index(["k1", "k2"])

    gdf2 = df2
    gdf2 = gdf2.set_index(["k1", "k2"])
Example #2
File: graph.py Project: orrrrtem/cugraph
    def degrees(self, vertex_subset=None):
        """
        Compute vertex in-degree and out-degree. By default, this method
        computes vertex degrees for the entire set of vertices. If
        vertex_subset is provided, this method optionally filters out all but
        those listed in vertex_subset.

        Parameters
        ----------
        vertex_subset : cudf.Series or iterable container, optional
            A container of vertices for displaying corresponding degree. If not
            set, degrees are computed for the entire set of vertices.

        Returns
        -------
        df : cudf.DataFrame
            df['vertex'] : cudf.Series
                The vertex IDs (will be identical to vertex_subset if
                specified).
            df['in_degree'] : cudf.Series
                The in-degree of the vertex.
            df['out_degree'] : cudf.Series
                The out-degree of the vertex.

        Examples
        --------
        >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ',
        ...                   dtype=['int32', 'int32', 'float32'], header=None)
        >>> sources = cudf.Series(M['0'])
        >>> destinations = cudf.Series(M['1'])
        >>> G = cugraph.Graph()
        >>> G.add_edge_list(sources, destinations, None)
        >>> df = G.degrees([0,9,12])

        """
        vertex_col, in_degree_col, out_degree_col = graph_new_wrapper._degrees(
            self)

        df = cudf.DataFrame()
        if vertex_subset is None:
            if self.renumbered is True:
                df['vertex'] = self.edgelist.renumber_map[vertex_col]
            else:
                df['vertex'] = vertex_col
            df['in_degree'] = in_degree_col
            df['out_degree'] = out_degree_col
        else:
            df['vertex'] = cudf.Series(
                np.asarray(vertex_subset, dtype=np.int32))
            if self.renumbered is True:
                renumber_series = cudf.Series(self.edgelist.renumber_map.index,
                                              index=self.edgelist.renumber_map)
                vertices_renumbered = renumber_series.loc[vertex_subset]

                df['in_degree'] = cudf.Series(
                    np.asarray([in_degree_col[i] for i in vertices_renumbered],
                               dtype=np.int32))
                df['out_degree'] = cudf.Series(
                    np.asarray(
                        [out_degree_col[i] for i in vertices_renumbered],
                        dtype=np.int32))
            else:
                df['in_degree'] = cudf.Series(
                    np.asarray([in_degree_col[i] for i in vertex_subset],
                               dtype=np.int32))
                df['out_degree'] = cudf.Series(
                    np.asarray([out_degree_col[i] for i in vertex_subset],
                               dtype=np.int32))

        return df
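
# A minimal sketch, using toy data rather than the cugraph API, of the
# renumber-map inversion used in the vertex_subset branch above: swapping the
# index and values of the map lets external vertex ids be looked up back to
# their internal positions with .loc.
import cudf

renumber_map = cudf.Series([10, 20, 30, 40])        # position -> external id
renumber_series = cudf.Series(renumber_map.index,   # external id -> position
                              index=renumber_map)
print(renumber_series.loc[[20, 40]].values)         # [1 3]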
Example #3
def test_gpu_file_iterator_ds(df, dataset, batch, engine):
    df_itr = cudf.DataFrame()
    for data_gd in dataset.to_iter(columns=mycols_csv):
        df_itr = cudf.concat([df_itr, data_gd], axis=0) if len(df_itr) else data_gd

    assert_eq(df_itr.reset_index(drop=True), df.reset_index(drop=True))


def test_mh_support(tmpdir, batch_size):
    data = {
        "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"],
                    ["User_C"]],
        "Reviewers": [
            ["User_A"],
            ["User_A", "User_E"],
            ["User_B", "User_C"],
            ["User_C"],
        ],
        "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
        "Embedding": [
            [0.1, 0.2, 0.3],
            [0.3, 0.4, 0.5],
            [0.6, 0.7, 0.8],
            [0.8, 0.4, 0.2],
        ],
        "Post": [1, 2, 3, 4],
    }
    df = cudf.DataFrame(data)
    cat_names = ["Authors", "Reviewers", "Engaging User"]
    cont_names = ["Embedding"]
    label_name = ["Post"]

    processor = nvt.Workflow(cat_names=cat_names,
                             cont_names=cont_names,
                             label_name=label_name)
    processor.add_preprocess(ops.HashBucket(num_buckets=10))
    processor.finalize()

    data_itr = tf_dataloader.KerasSequenceLoader(
        nvt.Dataset(df),
        cat_names=cat_names,
        cont_names=cont_names,
        label_names=label_name,
        batch_size=batch_size,
        shuffle=False,
    )
    data_itr.map(processor)

    idx = 0
    for X, y in data_itr:
        assert len(X) == 7
        n_samples = y.shape[0]

        for mh_name in ["Authors", "Reviewers", "Embedding"]:
            for postfix in ["__nnzs", "__values"]:
                assert (mh_name + postfix) in X
                array = X[mh_name + postfix].numpy()[:, 0]

                if postfix == "__nnzs":
                    if mh_name == "Embedding":
                        assert (array == 3).all()
                    else:
                        lens = [
                            len(x)
                            for x in data[mh_name][idx * batch_size:idx *
                                                   batch_size + n_samples]
                        ]
                        assert (array == np.array(lens)).all()
                else:
                    if mh_name == "Embedding":
                        assert len(array) == (n_samples * 3)
                    else:
                        assert len(array) == sum(lens)
        idx += 1
    assert idx == (3 // batch_size + 1)
Example #5
def _encode(
    name,
    storage_name,
    path,
    gdf,
    cat_cache,
    na_sentinel=-1,
    freq_threshold=0,
    search_sorted=False,
    buckets=None,
    encode_type="joint",
    cat_names=None,
):
    if isinstance(buckets, int):
        buckets = {name: buckets for name in cat_names}

    value = None
    selection_l = name if isinstance(name, list) else [name]
    selection_r = name if isinstance(name, list) else [storage_name]
    list_col = _is_list_col(selection_l, gdf)
    if path:
        if cat_cache is not None:
            cat_cache = (cat_cache if isinstance(cat_cache, str) else
                         cat_cache.get(storage_name, "disk"))
            if len(gdf):
                with get_worker_cache("cats") as cache:
                    value = fetch_table_data(cache,
                                             path,
                                             columns=selection_r,
                                             cache=cat_cache,
                                             cats_only=True)
        else:
            value = cudf.io.read_parquet(path,
                                         index=False,
                                         columns=selection_r)
            value.index.name = "labels"
            value.reset_index(drop=False, inplace=True)

    if value is None:
        value = cudf.DataFrame()
        for c in selection_r:
            typ = gdf[selection_l[0]].dtype if len(
                selection_l) == 1 else gdf[c].dtype
            value[c] = cudf.Series([None], dtype=typ)
        value.index.name = "labels"
        value.reset_index(drop=False, inplace=True)

    if not search_sorted:
        if list_col:
            codes = cudf.DataFrame(
                {selection_l[0]: gdf[selection_l[0]].list.leaves})
            codes["order"] = cp.arange(len(codes))
        else:
            codes = cudf.DataFrame({"order": cp.arange(len(gdf))},
                                   index=gdf.index)
            for c in selection_l:
                codes[c] = gdf[c].copy()
        if buckets and storage_name in buckets:
            na_sentinel = _hash_bucket(gdf,
                                       buckets,
                                       selection_l,
                                       encode_type=encode_type)
        # apply frequency hashing
        if freq_threshold and buckets and storage_name in buckets:
            merged_df = codes.merge(value,
                                    left_on=selection_l,
                                    right_on=selection_r,
                                    how="left").sort_values("order")
            merged_df.reset_index(drop=True, inplace=True)
            max_id = merged_df["labels"].max()
            merged_df["labels"].fillna(cudf.Series(na_sentinel + max_id + 1),
                                       inplace=True)
            labels = merged_df["labels"].values
        # only do hashing
        elif buckets and storage_name in buckets:
            labels = na_sentinel
        # no hashing
        else:
            na_sentinel = 0
            labels = codes.merge(value,
                                 left_on=selection_l,
                                 right_on=selection_r,
                                 how="left").sort_values("order")["labels"]
            labels.fillna(na_sentinel, inplace=True)
            labels = labels.values
    else:
        # Use `searchsorted` if we are using a "full" encoding
        if list_col:
            labels = value[selection_r].searchsorted(
                gdf[selection_l[0]].list.leaves,
                side="left",
                na_position="first")
        else:
            labels = value[selection_r].searchsorted(gdf[selection_l],
                                                     side="left",
                                                     na_position="first")
        labels[labels >= len(value[selection_r])] = na_sentinel

    if list_col:
        labels = _encode_list_column(gdf[selection_l[0]], labels)

    return labels
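
# A minimal sketch, outside of NVTabular, of the merge-based lookup that
# _encode performs when search_sorted is False: the "value" table holds the
# unique categories with their row position as the label, and a left merge
# (order-preserving via an "order" column) produces the encoded labels.
import cudf
import cupy as cp

gdf = cudf.DataFrame({"Author": ["B", "A", "C", "A"]})

value = cudf.DataFrame({"Author": ["A", "B", "C"]})
value.index.name = "labels"
value = value.reset_index()

codes = cudf.DataFrame({"Author": gdf["Author"], "order": cp.arange(len(gdf))})
labels = (codes.merge(value, on="Author", how="left")
          .sort_values("order")["labels"].values)
print(labels)  # [1 0 2 0]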
Example #6
def _stratify_split(X, y, n_train, n_test, x_numba, y_numba, random_state):
    """
    Function to perform a stratified split based on y labels.
    Based on scikit-learn stratified split implementation.

    Parameters
    ----------
    X, y: Shuffled input data and labels
    n_train: Number of samples in train set
    n_test: number of samples in test set
    x_numba: Determines whether the data should be converted to numba
    y_numba: Determines whether the labels should be converted to numba

    Returns
    -------
    X_train, X_test: Data X divided into train and test sets
    y_train, y_test: Labels divided into train and test sets
    """
    x_cudf = False
    y_cudf = False

    if isinstance(X, cudf.DataFrame):
        x_cudf = True
    elif hasattr(X, "__cuda_array_interface__"):
        X = cp.asarray(X)
        x_order = _strides_to_order(X.__cuda_array_interface__['strides'],
                                    cp.dtype(X.dtype))

    if isinstance(y, cudf.Series):
        y_cudf = True
    elif hasattr(y, "__cuda_array_interface__"):
        y = cp.asarray(y)
        y_order = _strides_to_order(y.__cuda_array_interface__['strides'],
                                    cp.dtype(y.dtype))
    elif isinstance(y, cudf.DataFrame):
        y_cudf = True
        # ensuring it has just one column
        if y.shape[1] != 1:
            raise ValueError('Expected one label column, but found y '
                             'with shape = %s' % (y.shape,))

    classes, y_indices = cp.unique(y.values if y_cudf else y,
                                   return_inverse=True)

    n_classes = classes.shape[0]
    class_counts = cp.bincount(y_indices)
    if n_train < n_classes:
        raise ValueError('The train_size = %d should be greater or '
                         'equal to the number of classes = %d' %
                         (n_train, n_classes))
    if n_test < n_classes:
        raise ValueError('The test_size = %d should be greater or '
                         'equal to the number of classes = %d' %
                         (n_test, n_classes))
    class_indices = cp.array_split(cp.argsort(y_indices), n_classes)

    X_train = None

    # random_state won't be None or int, that's handled earlier
    if isinstance(random_state, np.random.RandomState):
        random_state = cp.random.RandomState(seed=random_state.get_state()[1])

    # Break ties
    n_i = _approximate_mode(class_counts, n_train, random_state)
    class_counts_remaining = class_counts - n_i
    t_i = _approximate_mode(class_counts_remaining, n_test, random_state)

    for i in range(n_classes):
        permutation = random_state.permutation(class_counts[i].item())
        perm_indices_class_i = class_indices[i].take(permutation)

        if hasattr(X, "__cuda_array_interface__") or \
           isinstance(X, cupyx.scipy.sparse.csr_matrix):

            X_train_i = cp.array(X[perm_indices_class_i[:n_i[i]]],
                                 order=x_order)
            X_test_i = cp.array(X[perm_indices_class_i[n_i[i]:n_i[i] +
                                                       t_i[i]]],
                                order=x_order)

            y_train_i = cp.array(y[perm_indices_class_i[:n_i[i]]],
                                 order=y_order)
            y_test_i = cp.array(y[perm_indices_class_i[n_i[i]:n_i[i] +
                                                       t_i[i]]],
                                order=y_order)

            if X_train is None:
                X_train = cp.array(X_train_i, order=x_order)
                y_train = cp.array(y_train_i, order=y_order)
                X_test = cp.array(X_test_i, order=x_order)
                y_test = cp.array(y_test_i, order=y_order)
            else:
                X_train = cp.concatenate([X_train, X_train_i], axis=0)
                X_test = cp.concatenate([X_test, X_test_i], axis=0)
                y_train = cp.concatenate([y_train, y_train_i], axis=0)
                y_test = cp.concatenate([y_test, y_test_i], axis=0)

        elif x_cudf:
            X_train_i = X.iloc[perm_indices_class_i[:n_i[i]]]
            X_test_i = X.iloc[perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]]]

            y_train_i = y.iloc[perm_indices_class_i[:n_i[i]]]
            y_test_i = y.iloc[perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]]]

            if X_train is None:
                X_train = X_train_i
                y_train = y_train_i
                X_test = X_test_i
                y_test = y_test_i
            else:
                X_train = cudf.concat([X_train, X_train_i], ignore_index=False)
                X_test = cudf.concat([X_test, X_test_i], ignore_index=False)
                y_train = cudf.concat([y_train, y_train_i], ignore_index=False)
                y_test = cudf.concat([y_test, y_test_i], ignore_index=False)

    if x_numba:
        X_train = cuda.as_cuda_array(X_train)
        X_test = cuda.as_cuda_array(X_test)
    elif x_cudf:
        X_train = cudf.DataFrame(X_train)
        X_test = cudf.DataFrame(X_test)

    if y_numba:
        y_train = cuda.as_cuda_array(y_train)
        y_test = cuda.as_cuda_array(y_test)
    elif y_cudf:
        y_train = cudf.DataFrame(y_train)
        y_test = cudf.DataFrame(y_test)

    return X_train, X_test, y_train, y_test
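
# A hedged usage sketch: _stratify_split backs the public splitter when a
# stratify target is passed (assuming cuml.model_selection.train_test_split
# exposes the `stratify` argument).
import numpy as np
import cudf
from cuml.model_selection import train_test_split

X = cudf.DataFrame({"a": np.arange(12), "b": np.arange(12)})
y = cudf.Series([0, 1, 2] * 4)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, stratify=y, random_state=42)

# Each class keeps roughly its original proportion in both splits.
print(y_train.value_counts().to_pandas().to_dict())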
Example #7
    def find_and_replace(self, to_replace, replacement, all_nan):
        """
        Return col with *to_replace* replaced with *replacement*.
        """

        # create a dataframe containing the pre-replacement categories
        # and a copy of them to work with. The index of this dataframe
        # represents the original ints that map to the categories
        old_cats = cudf.DataFrame()
        old_cats["cats"] = column.as_column(self.dtype.categories)
        new_cats = old_cats.copy(deep=True)

        # Create a column with the appropriate labels replaced
        old_cats["cats_replace"] = old_cats["cats"].replace(
            to_replace, replacement
        )

        # Construct the new categorical labels
        # If a category is being replaced by an existing one, we
        # want to map it to None. If it's totally new, we want to
        # map it to the new label it is to be replaced by
        dtype_replace = cudf.Series(replacement)
        dtype_replace[dtype_replace.isin(old_cats["cats"])] = None
        new_cats["cats"] = new_cats["cats"].replace(to_replace, dtype_replace)

        # anything we mapped to None, we want to now filter out since
        # those categories don't exist anymore
        # Resetting the index creates a column 'index' that associates
        # the original integers to the new labels
        bmask = new_cats["cats"]._column.notna()
        new_cats = cudf.DataFrame(
            {"cats": new_cats["cats"]._column.apply_boolean_mask(bmask)}
        ).reset_index()

        # old_cats contains replaced categories and the ints that
        # previously mapped to those categories and the index of
        # new_cats is a RangeIndex that contains the new ints
        catmap = old_cats.merge(
            new_cats, left_on="cats_replace", right_on="cats", how="inner"
        )

        # The index of this frame is now the old ints, but the column
        # named 'index', which came from the filtered categories,
        # contains the new ints that we need to map to
        to_replace_col = column.as_column(catmap.index).astype(
            self.cat().codes.dtype
        )
        replacement_col = catmap["index"]._column.astype(
            self.cat().codes.dtype
        )

        replaced = column.as_column(self.cat().codes)
        output = libcudf.replace.replace(
            replaced, to_replace_col, replacement_col
        )

        return column.build_categorical_column(
            categories=new_cats["cats"],
            codes=column.as_column(output.base_data, dtype=output.dtype),
            mask=output.base_mask,
            offset=output.offset,
            size=output.size,
            ordered=self.dtype.ordered,
        )
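
# A small usage-level sketch of the behavior this method implements, assuming
# the public cudf.Series.replace path over a categorical column: replacing a
# category that collapses onto an existing one remaps its codes instead of
# growing the category set.
import cudf

s = cudf.Series(["a", "b", "c", "a"], dtype="category")
print(s.replace("a", "b").to_pandas().tolist())  # ['b', 'b', 'c', 'b']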
Example #8
    def _query6(self):
        self._loadTables('query6')

        self.rideReqTable = self.rideReqTable[
            self.rideReqTable['rideReq.time'] < self.rideReqTable.shape[0] /
            10]

        rideReqIndex = self._createIndex(
            self.rideReqTable,
            'rideReq.start',
        )

        driverStatusIndex = self._createIndex(
            self.driverStatusTable,
            'drvStat.pos',
        )

        locationPolygon = self._createBox(
            self.locationTable,
            'loc.bounds',
        )

        trainX = {}
        for i in range(10):
            trainX['c{}'.format(i)] = np.random.rand(1000)
        trainX = cudf.DataFrame(trainX)

        trainY = np.random.choice([0.0, 1.0], size=1000)
        trainY = cudf.Series(trainY)

        linReg = cuml.LinearRegression()
        linReg.fit(trainX, trainY)

        startTime = time.time()

        (joinRideReq, numbaTime0) = self._spatialJoinDist(
            self.rideReqTable, self.locationTable, 'rideReq.start',
            'loc.bounds', rideReqIndex, locationPolygon, 0.0)

        joinRideReq['count'] = 0
        reqGroup = joinRideReq.groupby(['loc.locationId'], ).agg({
            'count':
            'count',
        }).reset_index()

        (joinDriver, numbaTime1) = self._spatialJoinDist(
            self.driverStatusTable, self.locationTable, 'drvStat.pos',
            'loc.bounds', driverStatusIndex, locationPolygon, 0.0)

        joinDriver['count'] = 0
        driverGroup = joinDriver.groupby(['loc.locationId'], ).agg({
            'count':
            'count',
        }).reset_index()

        join0 = reqGroup.merge(driverGroup, on='loc.locationId')
        join1 = join0.merge(self.locationTable, on='loc.locationId')

        featureName = [
            'loc.c0',
            'loc.c1',
            'loc.c2',
            'loc.c3',
            'loc.c4',
            'loc.c5',
            'loc.c6',
            'loc.c7',
            'loc.c8',
            'loc.c9',
        ]
        join1['infer'] = linReg.predict(join1[featureName])

        endTime = time.time()

        join1.to_csv(
            'query6_gpu.csv',
            index=False,
        )
        return endTime - startTime - numbaTime0 - numbaTime1
Example #9
async def test_ucx_localcluster():
    async with LocalCUDACluster(
            protocol="ucx",
            dashboard_address=None,
            n_workers=2,
            threads_per_worker=1,
            processes=True,
            asynchronous=True,
            enable_tcp_over_ucx=enable_tcp_over_ucx,
            enable_nvlink=enable_nvlink,
            enable_infiniband=enable_infiniband,
    ) as cluster:
        async with Client(cluster, asynchronous=True) as client:
            """
            Next, simply call list using an asynchronous Dask client.
            The callback function is pushed to the workers and
            invoked when a message is received with a BlazingMessage
            """

            try:
                ips_ports = await listen_async(callback=mock_msg_callback,
                                               client=client)

                print(str(ips_ports))

                "<<<<<<<<<< Begin Test Logic >>>>>>>>>>>>"

                assert len(ips_ports) == len(
                    client.scheduler_info()["workers"])
                for k, v in ips_ports.items():
                    assert v is not None
                import numpy as np

                meta = {"worker_ids": tuple(ips_ports.keys())}
                data = cudf.DataFrame({
                    "%s" % x: cudf.Series(np.arange(37000))
                    for x in range(50)
                })
                """
                Loop through each of the workers, sending a test BlazingMessage
                to all other workers.
                """
                for dask_addr, blazing_addr in ips_ports.items():
                    msg = BlazingMessage(meta, data)

                    for n in range(1):

                        async def send(msg):
                            await UCX.get().send(msg)

                        await client.run(send,
                                         msg,
                                         workers=[dask_addr],
                                         wait=True)
                """
                Gather messages received on each worker for validation
                """
                received = await client.run(
                    lambda: get_worker()._test_msgs_received, wait=True)

                assert len(received) == len(ips_ports)

                for worker_addr, msgs in received.items():
                    for msg in msgs:
                        cudf_test.assert_eq(msg.data, data)
                        assert msg.metadata == meta
                    assert len(msgs) == len(ips_ports)
            finally:

                print("Cleaning up")
                await cleanup(client)
Example #10
def symmetrize_df(df, src_name, dst_name, multi=False, symmetrize=True):
    """
    Take a COO stored in a DataFrame, along with the column names of
    the source and destination columns, and create a new data frame
    using the same column names that symmetrizes the graph so that all
    edges appear in both directions.
    Note that if other columns exist in the data frame (e.g. edge weights)
    the other columns will also be replicated.  That is, if (u,v,data)
    represents the source value (u), destination value (v) and some
    set of other columns (data) in the input data, then the output
    data will contain both (u,v,data) and (v,u,data) with matching
    data.
    If (u,v,data1) and (v,u,data2) exist in the input data where data1
    != data2 then this code will arbitrarily pick the smaller data
    element to keep; if this is not desired, then the caller should
    correct the data prior to calling symmetrize.

    Parameters
    ----------
    df : cudf.DataFrame
        Input data frame containing COO.  Columns should contain source
        ids, destination ids and any properties associated with the
        edges.
    src_name : string
        Name of the column in the data frame containing the source ids
    dst_name : string
        Name of the column in the data frame containing the destination ids
    multi : bool
        Set to True if graph is a Multi(Di)Graph. This allows multiple
        edges instead of dropping them.
    symmetrize : bool
        Default is True to perform symmetrization. If False only duplicate
        edges are dropped.

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize()
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'weight'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> sym_ddf = cugraph.symmetrize_ddf(ddf, "src", "dst", "weight")
    >>> Comms.destroy()
    """
    #
    #  Now append the columns.  We add sources to the end of destinations,
    #  and destinations to the end of sources.  Otherwise we append a
    #  column onto itself.
    #
    if symmetrize:
        gdf = cudf.DataFrame()
        for idx, name in enumerate(df.columns):
            if name == src_name:
                gdf[src_name] = df[src_name].append(df[dst_name],
                                                    ignore_index=True)
            elif name == dst_name:
                gdf[dst_name] = df[dst_name].append(df[src_name],
                                                    ignore_index=True)
            else:
                gdf[name] = df[name].append(df[name], ignore_index=True)
    else:
        gdf = df
    if multi:
        return gdf
    else:
        return gdf.groupby(by=[src_name, dst_name], as_index=False).min()
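
# A minimal sketch of calling symmetrize_df on a tiny weighted edge list; the
# result contains each edge in both directions with its weight replicated.
import cudf

edges = cudf.DataFrame({"src": [0, 1], "dst": [1, 2], "weight": [1.0, 2.0]})
sym = symmetrize_df(edges, "src", "dst")
# Expect the edge set {(0, 1), (1, 0), (1, 2), (2, 1)}, each with its weight.
print(sym.sort_values(["src", "dst"]))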
Example #11
File: test_bfs.py Project: mattf/cugraph
def convert_output_to_cudf(input_G_or_matrix, cugraph_result):
    """
    Convert cugraph_result to a cudf DataFrame. The conversion is based on the
    type of input_G_or_matrix, since different input types result in different
    cugraph_result types (see cugraph_input_output_map).
    """
    input_type = type(input_G_or_matrix)
    expected_return_type = cuGraph_input_output_map[type(input_G_or_matrix)]
    assert type(cugraph_result) is expected_return_type

    if expected_return_type is cudf.DataFrame:
        return cugraph_result

    elif expected_return_type is pd.DataFrame:
        return cudf.from_pandas(cugraph_result)

    # A CuPy/SciPy input means the return value will be a 2-tuple of:
    #   distance: cupy.ndarray
    #      ndarray of shortest distances between source and vertex.
    #   predecessor: cupy.ndarray
    #      ndarray of predecessors of a vertex on the path from source, which
    #      can be used to reconstruct the shortest paths.
    # or a 3-tuple of the above 2 plus
    #   sp_counter: cupy.ndarray
    #      for the i'th position in the array, the number of shortest paths
    #      leading to the vertex at position i in the (input) vertex array.
    elif expected_return_type is tuple:
        if input_type in cupy_types:
            assert type(cugraph_result[0]) is cp.ndarray
            assert type(cugraph_result[1]) is cp.ndarray
            if len(cugraph_result) == 3:
                assert type(cugraph_result[2]) is cp.ndarray
        else:
            assert type(cugraph_result[0]) is np.ndarray
            assert type(cugraph_result[1]) is np.ndarray
            if len(cugraph_result) == 3:
                assert type(cugraph_result[2]) is np.ndarray

        # Get unique verts from input since they are not included in output
        if type(input_G_or_matrix) in [
                cp_csr_matrix, cp_csc_matrix, sp_csr_matrix, sp_csc_matrix
        ]:
            coo = input_G_or_matrix.tocoo(copy=False)
        else:
            coo = input_G_or_matrix
        verts = sorted(
            set([n.item() for n in coo.col] + [n.item() for n in coo.row]))
        dists = [n.item() for n in cugraph_result[0]]
        preds = [n.item() for n in cugraph_result[1]]
        assert len(verts) == len(dists) == len(preds)

        d = {"vertex": verts, "distance": dists, "predecessor": preds}

        if len(cugraph_result) == 3:
            counters = [n.item() for n in cugraph_result[2]]
            assert len(counters) == len(verts)
            d.update({"sp_counter": counters})

        return cudf.DataFrame(d)

    else:
        raise RuntimeError(f"unsupported return type: {expected_return_type}")
Example #12
def symmetrize(source_col,
               dest_col,
               value_col=None,
               multi=False,
               symmetrize=True):
    """
    Take a COO set of source destination pairs, along with associated values,
    stored on a single GPU or distributed, and create a new COO set of source
    destination pairs along with values where all edges exist in both
    directions.

    The return from this call will be a COO stored as two cudf Series or
    dask_cudf.Series (the symmetrized source column and the symmetrized dest
    column), along with
    an optional cudf Series containing the associated values (only if the
    values are passed in).

    Parameters
    ----------
    source_col : cudf.Series or dask_cudf.Series
        This cudf.Series wraps a gdf_column of size E (E: number of edges).
        The gdf column contains the source index for each edge.
        Source indices must be an integer type.
    dest_col : cudf.Series or dask_cudf.Series
        This cudf.Series wraps a gdf_column of size E (E: number of edges).
        The gdf column contains the destination index for each edge.
        Destination indices must be an integer type.
    value_col : cudf.Series or dask_cudf.Series (optional)
        This cudf.Series wraps a gdf_column of size E (E: number of edges).
        The gdf column contains values associated with this edge.
        For this function the values can be any type, they are not
        examined, just copied.

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> sources = cudf.Series(M['0'])
    >>> destinations = cudf.Series(M['1'])
    >>> values = cudf.Series(M['2'])
    >>> src, dst, val = cugraph.symmetrize(sources, destinations, values)
    """

    input_df = None
    weight_name = None
    if type(source_col) is dask_cudf.Series:
        # FIXME convoluted way of just wrapping dask cudf Series in a ddf
        input_df = source_col.to_frame()
        input_df = input_df.rename(columns={source_col.name: "source"})
        input_df["destination"] = dest_col
    else:
        input_df = cudf.DataFrame({
            "source": source_col,
            "destination": dest_col
        })
        csg.null_check(source_col)
        csg.null_check(dest_col)
    if value_col is not None:
        if isinstance(value_col, cudf.Series):
            weight_name = "value"
            input_df.insert(len(input_df.columns), "value", value_col)
        elif isinstance(value_col, cudf.DataFrame):
            input_df = cudf.concat([input_df, value_col], axis=1)

    output_df = None
    if type(source_col) is dask_cudf.Series:
        output_df = symmetrize_ddf(input_df, "source", "destination",
                                   weight_name).persist()
    else:
        output_df = symmetrize_df(input_df, "source", "destination", multi,
                                  symmetrize)
    if value_col is not None:
        if isinstance(value_col, cudf.Series):
            return (
                output_df["source"],
                output_df["destination"],
                output_df["value"],
            )
        elif isinstance(value_col, cudf.DataFrame):
            return (
                output_df["source"],
                output_df["destination"],
                output_df[value_col.columns],
            )
    return output_df["source"], output_df["destination"]
Example #13
async def test_cuda_backend():
    import cupy
    import cudf

    params, teardown_params = await CudaStorage.setup()
    storage = CudaStorage(**params)
    assert storage.level == StorageLevel.GPU

    data1 = cupy.asarray(np.random.rand(10, 10))
    put_info1 = await storage.put(data1)
    get_data1 = await storage.get(put_info1.object_id)
    cupy.testing.assert_array_equal(data1, get_data1)

    info1 = await storage.object_info(put_info1.object_id)
    assert info1.size == put_info1.size

    await storage.delete(put_info1.object_id)

    data2 = cudf.DataFrame(
        pd.DataFrame(
            {
                'col1': np.arange(10),
                'col2': [f'str{i}' for i in range(10)],
                'col3': np.random.rand(10)
            }, ))
    put_info2 = await storage.put(data2)
    get_data2 = await storage.get(put_info2.object_id)
    cudf.testing.assert_frame_equal(data2, get_data2)

    info2 = await storage.object_info(put_info2.object_id)
    assert info2.size == put_info2.size

    await CudaStorage.teardown(**teardown_params)

    # test writer and reader
    t = np.random.random(10)
    buffers = await AioSerializer(t).run()
    size = sum(getattr(buf, 'nbytes', len(buf)) for buf in buffers)
    async with await storage.open_writer(size=size) as writer:
        for buf in buffers:
            await writer.write(buf)

    async with await storage.open_reader(writer.object_id) as reader:
        content = await reader.read()
        b = content.to_host_array().tobytes()
        t2 = await AioDeserializer(io.BytesIO(b)).run()
    np.testing.assert_array_equal(t, t2)

    # write cupy array
    t = cupy.random.random((10, ))
    headers, buffers = serialize(t)
    async with await storage.open_writer(size=len(b)) as writer:
        for buffer in buffers:
            await writer.write(buffer.data)

    async with await storage.open_reader(writer.object_id) as reader:
        b2 = await reader.read()
        t2 = deserialize(headers, [b2])

    cupy.testing.assert_array_equal(t, t2)

    await CudaStorage.teardown(**teardown_params)
Example #14
def test_categorify_freq_limit(tmpdir, freq_limit, buckets, search_sort):
    df = cudf.DataFrame({
        "Author": [
            "User_A",
            "User_E",
            "User_B",
            "User_C",
            "User_A",
            "User_E",
            "User_B",
            "User_C",
            "User_B",
            "User_C",
        ],
        "Engaging User": [
            "User_B",
            "User_B",
            "User_A",
            "User_D",
            "User_B",
            "User_c",
            "User_A",
            "User_D",
            "User_D",
            "User_D",
        ],
    })

    isfreqthr = (isinstance(freq_limit, int)
                 and freq_limit > 0) or (isinstance(freq_limit, dict))

    if (not search_sort and isfreqthr) or (search_sort and not isfreqthr):
        cat_names = ["Author", "Engaging User"]

        cats = cat_names >> ops.Categorify(
            freq_threshold=freq_limit,
            out_path=str(tmpdir),
            search_sorted=search_sort,
            num_buckets=buckets,
        )

        workflow = nvt.Workflow(cats)
        df_out = workflow.fit_transform(
            nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

        if freq_limit and not buckets:
            # Column combinations are encoded
            if isinstance(freq_limit, dict):
                assert df_out["Author"].max() == 2
                assert df_out["Engaging User"].max() == 1
            else:
                assert len(df["Author"].unique()) == df_out["Author"].max()
                assert len(df["Engaging User"].unique()
                           ) == df_out["Engaging User"].max()
        elif not freq_limit and buckets:
            if isinstance(buckets, dict):
                assert df_out["Author"].max() <= 9
                assert df_out["Engaging User"].max() <= 19
            else:
                assert df_out["Author"].max() <= 9
                assert df_out["Engaging User"].max() <= 9
        elif freq_limit and buckets:
            if isinstance(buckets, dict):
                assert (
                    df_out["Author"].max() <=
                    (df["Author"].hash_values() % buckets["Author"]).max() +
                    2 + 1)
                assert (df_out["Engaging User"].max() <=
                        (df["Engaging User"].hash_values() %
                         buckets["Engaging User"]).max() + 1 + 1)
Example #15
File: io.py Project: ayushdg/GPU-GWAS
def _transform_df(df, sample_key_cols, common_key_cols, common_cols, drop_cols):
    """
    Inputs
    ------

    df: pd.DataFrame
        A pandas dataframe read from a vcf file using variantworks.io.vcfio.VCFReader
    sample_key_cols: list
        List of `sample_variant` columns in the df
    common_key_cols: list
        List of common_variants columns across all samples at a location
    common_cols: list
        List of common per-variant columns (e.g. chrom, start_pos, ref, alt)
        carried into the output
    drop_cols : list
        Columns to drop

    Returns
    -------

    A cuDF dataframe reshaped to one row per (chrom, pos, ref, alt, sample)
    combination, with the per-sample keys pivoted into call_* columns
    """
    sample_key_cols = list(set(sample_key_cols) - set(drop_cols))
    common_key_cols = list(set(common_key_cols) - set(drop_cols))
    common_cols = list(set(common_cols) - set(drop_cols))

    df2 = df.drop(columns=drop_cols)
    df2 = df2[sample_key_cols].transpose()
    df2.reset_index(inplace=True)
    pid_attr_split = df2["index"].str.split("_", expand=True)
    pid_attr_split.columns = ["sample", "key"]
    pid_attr_split["key"] = "call_" + pid_attr_split["key"]
    df2 = pd.concat([df2, pid_attr_split], axis=1)
    df2.drop(columns="index", axis=1, inplace=True)

    temp = pd.DataFrame(pid_attr_split["sample"].unique())
    unique_samples = len(temp)
    temp.columns = ["sample"]
    temp = temp.loc[temp.index.repeat(len(common_key_cols))]
    temp = temp.reset_index(drop=True)

    temp2 = df[common_key_cols].transpose().astype("float64")
    temp2["key"] = temp2.index

    temp2 = pd.concat([temp2] * unique_samples, axis=0)
    temp2 = temp2.reset_index(drop=True)
    temp = pd.concat([temp2, temp], axis=1)
    del temp2

    df2 = pd.concat([df2, temp], axis=0)
    del temp

    res_df = pd.melt(
        df2,
        id_vars=["sample", "key"],
        value_vars=df2.columns[:-2],
        var_name="location",
    )
    del df2
    gdf1 = cudf.DataFrame(res_df)
    gdf2 = cudf.DataFrame(df[common_cols])
    gdf1 = gdf1.merge(gdf2, how="left", left_on="location", right_index=True)

    del gdf2

    gdf1 = gdf1.astype({"ref": "int8", "alt": "int8"})
    gdf1 = gdf1[["chrom", "start_pos", "ref", "alt", "sample", "key", "value"]]
    gdf1 = gdf1.pivot(
        index=["chrom", "start_pos", "ref", "alt", "sample"],
        columns=["key"],
        values=["value"],
    ).reset_index()

    col_list = [i[1] if i[0] == "value" else i[0] for i in list(gdf1.columns)]
    gdf1.columns = col_list
    gdf1.rename(columns={"start_pos": "pos"}, inplace=True)
    return gdf1
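
# A toy pandas sketch (hypothetical column names) of the transpose -> split ->
# melt -> pivot reshape that _transform_df performs on the per-sample columns.
import pandas as pd

wide = pd.DataFrame({
    "s1_DP": [10, 12], "s1_GQ": [99, 80],   # sample s1, keys DP/GQ
    "s2_DP": [7, 9], "s2_GQ": [60, 75],     # sample s2, keys DP/GQ
})

long = wide.transpose().reset_index()
long[["sample", "key"]] = long["index"].str.split("_", expand=True)
long = long.drop(columns="index").melt(id_vars=["sample", "key"],
                                       var_name="location")

# One row per (location, sample), one column per key.
tidy = long.pivot(index=["location", "sample"], columns="key", values="value")
print(tidy.reset_index())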
Example #16
def main(client, config):
    import dask_cudf
    import cudf

    item_df = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    wcs_tstamp_min = get_wcs_minima(config)

    item_df["i_item_sk"] = item_df["i_item_sk"].astype("int32")
    item_df["i_category_id"] = item_df["i_category_id"].astype("int8")

    # we eventually will only care about these categories, so we can filter now
    item_df_filtered = item_df.loc[
        item_df.i_category_id.isin(q03_purchased_item_category_IN)
    ].reset_index(drop=True)

    # The main idea is that we don't fuse the filtration task with the read
    # task; doing so creates more memory pressure because we read (and spill)
    # the whole table at once and only then filter it.

    web_clickstream_flist = glob.glob(os.path.join(config["data_dir"], "web_clickstreams/*.parquet"))
    task_ls = [
        delayed(pre_repartition_task)(fn, item_df.to_delayed()[0], wcs_tstamp_min)
        for fn in web_clickstream_flist
    ]

    meta_d = {
        "wcs_user_sk": np.ones(1, dtype=np.int32),
        "tstamp": np.ones(1, dtype=np.int32),
        "wcs_item_sk": np.ones(1, dtype=np.int32),
        "wcs_sales_sk": np.ones(1, dtype=np.int32),
        "i_category_id": np.ones(1, dtype=np.int8),
    }
    meta_df = cudf.DataFrame(meta_d)

    merged_df = dask_cudf.from_delayed(task_ls, meta=meta_df)

    merged_df = merged_df.shuffle(on="wcs_user_sk")

    meta_d = {
        "i_item_sk": np.ones(1, dtype=merged_df["wcs_item_sk"].dtype),
        "cnt": np.ones(1, dtype=merged_df["wcs_item_sk"].dtype),
    }
    meta_df = cudf.DataFrame(meta_d)

    grouped_df = merged_df.map_partitions(
        reduction_function, item_df_filtered.to_delayed()[0], meta=meta_df
    )

    ### todo: check if this has any impact on stability
    grouped_df = grouped_df.persist(priority=10000)
    ### todo: remove this later after more testing
    wait(grouped_df)
    print("---" * 20)
    print("grouping complete ={}".format(len(grouped_df)))
    grouped_df = grouped_df.groupby(["i_item_sk"]).sum(split_every=2).reset_index()
    grouped_df.columns = ["i_item_sk", "cnt"]
    result_df = grouped_df.map_partitions(
        lambda df: df.sort_values(by=["cnt"], ascending=False)
    )

    result_df.columns = ["lastviewed_item", "cnt"]
    result_df["purchased_item"] = q03_purchased_item_IN
    cols_order = ["purchased_item", "lastviewed_item", "cnt"]
    result_df = result_df[cols_order]
    result_df = result_df.persist()
    ### todo: remove this later after more testing
    wait(result_df)
    print(len(result_df))
    result_df = result_df.head(q03_limit)
    print("result complete")
    print("---" * 20)
    return result_df
Example #17
File: io.py Project: ayushdg/GPU-GWAS
def load_vcf(vcf_file, info_keys=[], format_keys=[]):
    """Function to load VCF into gwas dataframe."""
    # Load VCF file using pysam
    reader = pysam.VariantFile(vcf_file)

    if "*" in info_keys:
        header_dict = dict(reader.header.info)
        new_keys = []
        for k in header_dict.keys():
            new_keys.append(k)
        info_keys = new_keys
    if "*" in format_keys:
        header_dict = dict(reader.header.formats)
        new_keys = []
        for k in header_dict.keys():
            new_keys.append(k)
        format_keys = new_keys

    print(info_keys)
    info_keys = set(info_keys)
    print(format_keys)
    format_keys = set(format_keys)

    df_dict = defaultdict(list)
    for record in reader:
        if len(record.alts) != 1:
            continue
        if record.ref not in nucleotide_dict or record.alts[0] not in nucleotide_dict:
            continue

        # Run through all variants and all their keys in format
        for sample in record.samples:
            format_dict = dict(record.samples[sample])
            for key, value in format_dict.items():
                if key not in format_keys:
                    continue
                # _add_basic_component(record, sample, df_dict)
                if key == "GT":
                    if None in list(value):
                        value = -1
                    else:
                        value = sum(list(value))
                _add_key_value(record, sample, f"call_{key}", value, df_dict)

            # Run through all variants and all their info keys
            info_dict = dict(record.info)
            for key, value in info_dict.items():
                if key not in info_keys:
                    continue
                # _add_basic_component(record, sample, df_dict)
                _add_key_value(record, sample, key, value, df_dict)

    df = pd.DataFrame.from_dict(df_dict)
    df, feature_mapping = _create_numerical_features(df)
    df = df.pivot_table(
        index=["chrom", "pos", "ref", "alt", "sample", "quality", "feature_id"],
        columns="key",
        values="value",
    ).reset_index()
    cuda_df = cudf.DataFrame(df)
    return cuda_df, feature_mapping
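
# A hedged usage sketch (hypothetical file path): '*' expands to every key
# declared in the VCF header, as handled above.
gwas_df, feature_mapping = load_vcf("samples.vcf", info_keys=["*"],
                                    format_keys=["GT", "DP"])
print(gwas_df.columns)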
Example #18
print(len(test))
if len(test) > 3:
    COMPUTE_CV = False
else:
    print('this submission notebook will compute CV score, but commit notebook will not')

train = pd.read_csv('./shopee-product-matching/train.csv')
tmp = train.groupby('label_group').posting_id.agg('unique').to_dict()
train['target'] = train.label_group.map(tmp)
print('train shape is', train.shape)
train.head()


if COMPUTE_CV:
    test = pd.read_csv('./shopee-product-matching/train.csv')
    test_gf = cudf.DataFrame(test)
    print('Using train as test to compute CV (since commit notebook). Shape is', test_gf.shape)
else:
    test = pd.read_csv('./shopee-product-matching/test.csv')
    test_gf = cudf.read_csv('./shopee-product-matching/test.csv')
    print('Test shape is', test_gf.shape)
test_gf.head()



def getMetric(col):
    def f1score(row):
        n = len(np.intersect1d(row.target,row[col]) )
        return 2*n / (len(row.target)+len(row[col]))
    return f1score
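
# A minimal sketch of using getMetric: score a trivial baseline in which each
# row predicts only its own posting_id ('preds' is a hypothetical column).
if COMPUTE_CV:
    train['preds'] = train.posting_id.map(lambda x: [x])
    train['f1'] = train.apply(getMetric('preds'), axis=1)
    print('Baseline CV score =', train.f1.mean())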
Example #19
File: scc.py Project: rapidsai/cugraph
def strong_connected_component(source, destination):
    """
    Generate the strongly connected components
    using the FW-BW-TRIM approach (but skipping the trimming)

    Parameters
    ----------
    source : cudf.Series
        A cudf series that contains the source side of an edge list

    destination : cudf.Series
        A cudf series that contains the destination side of an edge list

    Returns
    -------
    cdf : cudf.DataFrame - a dataframe for components
        df['vertex']   - the vertex ID
        df['id']       - the component ID

    sdf : cudf.DataFrame - a dataframe with single vertex components
        df['vertex']   - the vertex ID

    count - int - the number of components found


    Examples
    --------
    >>> # M = read_mtx_file(graph_file)
    >>> # sources = cudf.Series(M.row)
    >>> # destinations = cudf.Series(M.col)
    >>> # components, single_components, count =
    >>> #   cugraph.strong_connected_component(source, destination)

    """
    # FIXME: Uncomment out the above example
    max_value = np.iinfo(np.int32).max  # NOQA

    # create the FW and BW graphs - this version does not modify the graphs
    G_fw = cugraph.Graph()
    G_bw = cugraph.Graph()

    G_fw.add_edge_list(source, destination)
    G_bw.add_edge_list(destination, source)

    # get a list of vertices and sort the list on out_degree
    d = G_fw.degrees()
    d = d.sort_values(by='out_degree', ascending=False)

    num_verts = len(d)

    # create space for the answers
    components = [None] * num_verts
    single_components = [None] * num_verts

    # Counts - aka array indices
    count = 0
    single_count = 0

    # remove vertices that cannot be in a component
    bad = d.query('in_degree == 0 or out_degree == 0')

    if len(bad):
        bad = bad.drop(['in_degree', 'out_degree'])

        single_components[single_count] = bad
        single_count = single_count + 1
        d = _filter_list(d, bad)

    # ----- Start processing -----
    while len(d) > 0:

        v = d['vertex'][0]

        # compute the forward BFS
        bfs_fw = cugraph.bfs(G_fw, v)
        bfs_fw = bfs_fw.query("distance != @max_value")

        # Now backwards
        bfs_bw = cugraph.bfs(G_bw, v)
        bfs_bw = bfs_bw.query("distance != @max_value")

        # intersection
        common = bfs_fw.merge(bfs_bw, on='vertex', how='inner')

        if len(common) > 1:
            common['id'] = v
            components[count] = common
            d = _filter_list(d, common)
            count = count + 1

        else:
            # v is an isolated vertex
            vdf = cudf.DataFrame()
            vdf['vertex'] = v

            single_components[single_count] = vdf
            single_count = single_count + 1
            d = d.iloc[1:]

    # end of loop until vertex queue is empty

    comp = _compress_array(components, count)
    sing = _compress_array(single_components, single_count)

    return comp, sing, count
Example #20
def concat(objs, axis=0, ignore_index=False, sort=None):
    """Concatenate DataFrames, Series, or Indices row-wise.

    Parameters
    ----------
    objs : list of DataFrame, Series, or Index
    axis : {0/'index', 1/'columns'}, default 0
        The axis to concatenate along.
    ignore_index : bool, default False
        Set True to ignore the index of the *objs* and provide a
        default range index instead.
    sort : bool, default False
        Sort non-concatenation axis if it is not already aligned.

    Returns
    -------
    A new object of like type with rows from each object in ``objs``.

    Examples
    --------
    Combine two ``Series``.

    >>> import cudf
    >>> s1 = cudf.Series(['a', 'b'])
    >>> s2 = cudf.Series(['c', 'd'])
    >>> s1
    0    a
    1    b
    dtype: object
    >>> s2
    0    c
    1    d
    dtype: object
    >>> cudf.concat([s1, s2])
    0    a
    1    b
    0    c
    1    d
    dtype: object

    Clear the existing index and reset it in the
    result by setting the ``ignore_index`` option to ``True``.

    >>> cudf.concat([s1, s2], ignore_index=True)
    0    a
    1    b
    2    c
    3    d
    dtype: object

    Combine two DataFrame objects with identical columns.

    >>> df1 = cudf.DataFrame([['a', 1], ['b', 2]],
    ...                    columns=['letter', 'number'])
    >>> df1
      letter  number
    0      a       1
    1      b       2
    >>> df2 = cudf.DataFrame([['c', 3], ['d', 4]],
    ...                    columns=['letter', 'number'])
    >>> df2
      letter  number
    0      c       3
    1      d       4
    >>> cudf.concat([df1, df2])
      letter  number
    0      a       1
    1      b       2
    0      c       3
    1      d       4

    Combine DataFrame objects with overlapping columns and return
    everything. Columns outside the intersection will
    be filled with ``null`` values.

    >>> df3 = cudf.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
    ...                    columns=['letter', 'number', 'animal'])
    >>> df3
      letter  number animal
    0      c       3    cat
    1      d       4    dog
    >>> cudf.concat([df1, df3], sort=False)
      letter  number animal
    0      a       1   None
    1      b       2   None
    0      c       3    cat
    1      d       4    dog

    Combine ``DataFrame`` objects horizontally along the
    x axis by passing in ``axis=1``.

    >>> df4 = cudf.DataFrame([['bird', 'polly'], ['monkey', 'george']],
    ...                    columns=['animal', 'name'])
    >>> df4
       animal    name
    0    bird   polly
    1  monkey  george
    >>> cudf.concat([df1, df4], axis=1)
      letter  number  animal    name
    0      a       1    bird   polly
    1      b       2  monkey  george
    """

    if not objs:
        raise ValueError("No objects to concatenate")

    objs = [obj for obj in objs if obj is not None]

    # Return for single object
    if len(objs) == 1:
        if ignore_index:
            result = cudf.DataFrame(
                data=objs[0]._data.copy(deep=True),
                index=cudf.RangeIndex(len(objs[0])),
            )
        else:
            result = objs[0].copy()
        return result

    if len(objs) == 0:
        raise ValueError("All objects passed were None")

    # Retrieve the base types of `objs`. In order to support sub-types
    # and object wrappers, we use `isinstance()` instead of comparing
    # types directly
    typs = set()
    for o in objs:
        if isinstance(o, cudf.MultiIndex):
            typs.add(cudf.MultiIndex)
        if issubclass(type(o), Index):
            typs.add(type(o))
        elif isinstance(o, DataFrame):
            typs.add(DataFrame)
        elif isinstance(o, Series):
            typs.add(Series)
        else:
            raise ValueError(f"cannot concatenate object of type {type(o)}")

    allowed_typs = {Series, DataFrame}

    param_axis = _axis_map.get(axis, None)
    if param_axis is None:
        raise ValueError(
            '`axis` must be 0 / "index" or 1 / "columns", got: {0}'.format(
                axis
            )
        )
    else:
        axis = param_axis

    # when axis is 1 (column) we can concat with Series and Dataframes
    if axis == 1:

        assert typs.issubset(allowed_typs)
        df = DataFrame()
        _normalize_series_and_dataframe(objs, axis=axis)

        objs, match_index = _align_objs(objs)

        for idx, o in enumerate(objs):
            if not ignore_index and idx == 0:
                df.index = o.index
            for col in o._data.names:
                if col in df._data:
                    raise NotImplementedError(
                        "A Column with duplicate name found: {0}, cuDF "
                        "doesn't support having multiple columns with "
                        "the same name yet.".format(col)
                    )
                df[col] = o._data[col]

        result_columns = objs[0].columns
        for o in objs[1:]:
            result_columns = result_columns.append(o.columns)

        df.columns = result_columns.unique()
        if ignore_index:
            df.index = None
            return df
        elif not match_index:
            return df.sort_index()
        else:
            return df

    typ = list(typs)[0]

    if len(typs) > 1:
        if allowed_typs == typs:
            # This block of code will run when `objs` has
            # both Series & DataFrame kind of inputs.
            _normalize_series_and_dataframe(objs, axis=axis)
            typ = DataFrame
        else:
            raise ValueError(
                "`concat` cannot concatenate objects of "
                "types: %r." % sorted([t.__name__ for t in typs])
            )

    if typ is DataFrame:
        objs = [obj for obj in objs if obj.shape != (0, 0)]
        if len(objs) == 0:
            # If objs is empty, that indicates all of
            # objs are empty dataframes.
            return cudf.DataFrame()
        elif len(objs) == 1:
            if ignore_index:
                result = cudf.DataFrame(
                    data=objs[0]._data.copy(deep=True),
                    index=cudf.RangeIndex(len(objs[0])),
                )
            else:
                result = objs[0].copy()
            return result
        else:
            return DataFrame._concat(
                objs, axis=axis, ignore_index=ignore_index, sort=sort
            )
    elif typ is Series:
        return Series._concat(
            objs, axis=axis, index=None if ignore_index else True
        )
    elif typ is cudf.MultiIndex:
        return cudf.MultiIndex._concat(objs)
    elif issubclass(typ, Index):
        return Index._concat(objs)
    else:
        raise ValueError(f"cannot concatenate object of type {typ}")
Example #21
def generate_chunk(i_chunk, local_size, num_chunks, chunk_type, frac_match):
    # Setting a seed that triggers max amount of comm in the two-GPU case.
    cupy.random.seed(17561648246761420848)

    chunk_type = chunk_type or "build"
    frac_match = frac_match or 1.0
    if chunk_type == "build":
        # Build dataframe
        #
        # "key" column is a unique sample within [0, local_size * num_chunks)
        #
        # "shuffle" column is a random selection of partitions (used for shuffle)
        #
        # "payload" column is a random permutation of the chunk_size

        start = local_size * i_chunk
        stop = start + local_size

        parts_array = cupy.arange(num_chunks, dtype="int64")
        shuffle_array = cupy.repeat(parts_array, math.ceil(local_size / num_chunks))

        df = cudf.DataFrame(
            {
                "key": cupy.arange(start, stop=stop, dtype="int64"),
                "shuffle": cupy.random.permutation(suffle_array)[:local_size],
                "payload": cupy.random.permutation(
                    cupy.arange(local_size, dtype="int64")
                ),
            }
        )
    else:
        # Other dataframe
        #
        # "key" column matches values from the build dataframe
        # for a fraction (`frac_match`) of the entries. The matching
        # entries are perfectly balanced across each partition of the
        # "base" dataframe.
        #
        # "payload" column is a random permutation of the chunk_size

        # Step 1. Choose values that DO match
        sub_local_size = local_size // num_chunks
        sub_local_size_use = max(int(sub_local_size * frac_match), 1)
        arrays = []
        for i in range(num_chunks):
            bgn = (local_size * i) + (sub_local_size * i_chunk)
            end = bgn + sub_local_size
            ar = cupy.arange(bgn, stop=end, dtype="int64")
            arrays.append(cupy.random.permutation(ar)[:sub_local_size_use])
        key_array_match = cupy.concatenate(tuple(arrays), axis=0)

        # Step 2. Add values that DON'T match
        missing_size = local_size - key_array_match.shape[0]
        start = local_size * num_chunks + local_size * i_chunk
        stop = start + missing_size
        key_array_no_match = cupy.arange(start, stop=stop, dtype="int64")

        # Step 3. Combine and create the final dataframe chunk (dask_cudf partition)
        key_array_combine = cupy.concatenate(
            (key_array_match, key_array_no_match), axis=0
        )
        df = cudf.DataFrame(
            {
                "key": cupy.random.permutation(key_array_combine),
                "payload": cupy.random.permutation(
                    cupy.arange(local_size, dtype="int64")
                ),
            }
        )
    return df
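A hedged driver sketch for the generator above (sizes are illustrative; cupy, cudf, and math are assumed to be imported as in the snippet):

# One "build" chunk and one probe-side chunk where roughly half of the
# keys match the corresponding build partition.
build_df = generate_chunk(0, 1000, 2, "build", 1.0)
other_df = generate_chunk(0, 1000, 2, "other", 0.5)

# Joining on "key" exercises the matching/non-matching split described
# in the comments above.
matches = build_df.merge(other_df, on="key", how="inner")
print(len(build_df), len(other_df), len(matches))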
Example #22
0
def parseHiveMetadataFor(curr_table, file_subset, partitions):
    metadata = {}
    names = []
    n_cols = len(curr_table.input.columns)
    dtypes = curr_table.input.dtypes
    columns = curr_table.input.columns
    n_files = len(file_subset)
    col_indexes = {}
    for index in range(n_cols):
        col_name = columns[index]
        names.append('min_' + str(index) + '_' + col_name)
        names.append('max_' + str(index) + '_' + col_name)
        col_indexes[col_name] = index

    names.append('file_handle_index')
    names.append('row_group_index')
    minmax_metadata_table = [[] for _ in range(2 * n_cols + 2)]
    table_partition = {}
    for file_index, partition_name in enumerate(partitions):
        curr_table = partitions[partition_name]
        for col_name, col_value_id in curr_table:
            table_partition.setdefault(col_name, []).append(col_value_id)
        minmax_metadata_table[len(minmax_metadata_table) -
                              2].append(file_index)
        minmax_metadata_table[len(minmax_metadata_table) - 1].append(0)

    for index in range(n_cols):
        col_name = columns[index]
        if col_name in table_partition:
            col_value_ids = table_partition[col_name]
            index = col_indexes[col_name]
            minmax_metadata_table[2 * index] = col_value_ids
            minmax_metadata_table[2 * index + 1] = col_value_ids
        else:
            if dtypes[col_name] == object or dtypes[col_name] == np.dtype(
                    'datetime64[ms]') or dtypes[col_name] == np.datetime64:
                return cudf.DataFrame({})
            minmax_metadata_table[2 * index] = (
                [np.iinfo(dtypes[col_name]).min] * n_files)
            minmax_metadata_table[2 * index + 1] = (
                [np.iinfo(dtypes[col_name]).max] * n_files)

    series = []
    for index in range(n_cols):
        col_name = columns[index]
        col1 = pd.Series(minmax_metadata_table[2 * index],
                         dtype=dtypes[col_name],
                         name=names[2 * index])
        col2 = pd.Series(minmax_metadata_table[2 * index + 1],
                         dtype=dtypes[col_name],
                         name=names[2 * index + 1])
        series.append(col1)
        series.append(col2)
    # The final two metadata slots are the file_handle_index and
    # row_group_index columns collected above.
    index = n_cols

    col1 = pd.Series(minmax_metadata_table[2 * index],
                     dtype=dtypes[col_name],
                     name=names[2 * index])
    col2 = pd.Series(minmax_metadata_table[2 * index + 1],
                     dtype=dtypes[col_name],
                     name=names[2 * index + 1])
    series.append(col1)
    series.append(col2)

    frame = OrderedDict(zip(names, series))
    metadata = cudf.DataFrame(frame)
    return metadata
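The expected shape of the inputs is implied by the loops above: partitions maps each Hive partition name to an iterable of (column_name, value_id) pairs, and only .input.columns and .input.dtypes are read from curr_table. String and datetime columns trigger the early empty-DataFrame return, so here is a hedged, hypothetical sketch with integer columns only (all names and values are made up for illustration):

import numpy as np
import pandas as pd
import cudf
from collections import OrderedDict
from types import SimpleNamespace

# Hypothetical stand-in exposing only the attributes the helper reads.
curr_table = SimpleNamespace(
    input=pd.DataFrame({
        "t_year": pd.Series([], dtype="int32"),
        "t_region": pd.Series([], dtype="int32"),
        "amount": pd.Series([], dtype="int64"),
    })
)

# Hive-style layout: two files, partitioned on t_year/t_region.
partitions = {
    "t_year=2017/t_region=0": [("t_year", 0), ("t_region", 0)],
    "t_year=2018/t_region=1": [("t_year", 1), ("t_region", 1)],
}
file_subset = ["part_0.parquet", "part_1.parquet"]

metadata = parseHiveMetadataFor(curr_table, file_subset, partitions)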
Example #23
0
def _top_level_groupby(gdf, cat_col_groups, tree_width, cont_cols, agg_list,
                       on_host, concat_groups, name_sep):
    sum_sq = "std" in agg_list or "var" in agg_list
    calculate_min = "min" in agg_list
    calculate_max = "max" in agg_list

    # Top-level operation for category-based groupby aggregations
    output = {}
    k = 0
    for i, cat_col_group in enumerate(cat_col_groups):
        if isinstance(cat_col_group, tuple):
            cat_col_group = list(cat_col_group)

        if isinstance(cat_col_group, str):
            cat_col_group = [cat_col_group]
        cat_col_group_str = _make_name(*cat_col_group, sep=name_sep)

        if concat_groups and len(cat_col_group) > 1:
            # Concatenate columns and replace cat_col_group
            # with the single name
            df_gb = cudf.DataFrame()
            ignore_index = True
            df_gb[cat_col_group_str] = _concat(
                [gdf[col] for col in cat_col_group], ignore_index)
            cat_col_group = [cat_col_group_str]
        else:
            # Compile aggregation dictionary and add "squared-sum"
            # column(s) (necessary when `cont_cols` is non-empty)
            df_gb = gdf[cat_col_group + cont_cols].copy(deep=False)

        agg_dict = {}
        agg_dict[cat_col_group[0]] = ["count"]
        for col in cont_cols:
            agg_dict[col] = ["sum"]
            if sum_sq:
                name = _make_name(col, "pow2", sep=name_sep)
                df_gb[name] = df_gb[col].pow(2)
                agg_dict[name] = ["sum"]

            if calculate_min:
                agg_dict[col].append("min")
            if calculate_max:
                agg_dict[col].append("max")

        # Perform groupby and flatten column index
        # (flattening provides better cudf support)
        if _is_list_col(cat_col_group, df_gb):
            # handle list columns by encoding the list values
            df_gb = cudf.DataFrame(
                {cat_col_group[0]: df_gb[cat_col_group[0]].list.leaves})

        gb = df_gb.groupby(cat_col_group, dropna=False).agg(agg_dict)
        gb.columns = [
            _make_name(*(tuple(cat_col_group) + name[1:]), sep=name_sep)
            if name[0] == cat_col_group[0] else _make_name(
                *(tuple(cat_col_group) + name), sep=name_sep)
            for name in gb.columns.to_flat_index()
        ]
        gb.reset_index(inplace=True, drop=False)
        del df_gb

        # Split the result by the hash value of the categorical column
        for j, split in enumerate(
                gb.partition_by_hash(cat_col_group,
                                     tree_width[cat_col_group_str],
                                     keep_index=False)):
            if on_host:
                output[k] = split.to_arrow(preserve_index=False)
            else:
                output[k] = split
            k += 1
        del gb
    return output
Example #24
0
def empty_dataframe():
    import cudf

    return cudf.DataFrame({"a": [1.0], "b": [1.0]}).head(0)
Example #25
0
File: graph.py Project: orrrrtem/cugraph
    def view_edge_list(self):
        """
        Display the edge list. Compute it if needed.

        NOTE: If the graph is of type Graph() then the displayed undirected
        edges are the same as displayed by networkx Graph(), but the
        direction could be different, i.e. an edge displayed by cugraph as
        (src, dst) could be displayed as (dst, src) by networkx.

        cugraph.Graph stores the symmetrized edgelist internally. For
        displaying the undirected edgelist of a Graph, the upper triangular
        matrix of the symmetrized edgelist is returned.

        networkx.Graph renumbers the input and stores the upper triangle of
        this renumbered input. Since the internal renumbering of networkx and
        cugraph is different, the upper triangular matrix of the networkx
        renumbered input may not be the same as cugraph's upper triangular
        matrix of the symmetrized edgelist. Hence the displayed source and
        destination pairs in both will represent the same edge, but node
        values could be swapped.

        Returns
        -------
        edgelist_df : cudf.DataFrame
            This cudf.DataFrame wraps source, destination and weight
            gdf_column of size E (E: number of edges)
            The 'src' column contains the source index for each edge.
            Source indices are in the range [0, V) (V: number of vertices).
            The 'dst' column contains the destination index for each edge.
            Destination indices are in the range [0, V) (V: number of
            vertices).
            For weighted graphs, the dataframe also contains a 'weight'
            column holding the weight value for each edge.
        """
        if self.edgelist is None:
            graph_wrapper.view_edge_list(self)
        if type(self) is Graph:
            edgelist_df = self.edgelist.edgelist_df[self.edgelist.edgelist_df[
                          'src'] <= self.edgelist.edgelist_df['dst']].\
                          reset_index(drop=True)
            self.edge_count = len(edgelist_df)
        else:
            edgelist_df = self.edgelist.edgelist_df

        if self.renumbered:
            if isinstance(self.edgelist.renumber_map, cudf.DataFrame):
                df = cudf.DataFrame()
                ncols = len(edgelist_df.columns) - 2
                unrnb_df_ = edgelist_df.merge(self.edgelist.renumber_map,
                                              left_on='src',
                                              right_on='id',
                                              how='left').drop(['id', 'src'])
                unrnb_df = unrnb_df_.merge(self.edgelist.renumber_map,
                                           left_on='dst',
                                           right_on='id',
                                           how='left').drop(['id', 'dst'])
                cols = unrnb_df.columns.to_list()
                df = unrnb_df[cols[ncols:] + cols[0:ncols]]
            else:
                df = cudf.DataFrame()
                for c in edgelist_df.columns:
                    if c in ['src', 'dst']:
                        df[c] = self.edgelist.renumber_map[edgelist_df[c]].\
                            reset_index(drop=True)
                    else:
                        df[c] = edgelist_df[c]
            return df
        else:
            return edgelist_df
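A hedged usage sketch for the method above, using the older edge-list API that this snippet targets (G.add_edge_list); with newer cugraph releases, G.from_cudf_edgelist(gdf, source='src', destination='dst') plays the same role:

import cudf
import cugraph

gdf = cudf.DataFrame({"src": [0, 1, 2, 2], "dst": [1, 2, 0, 3]})

G = cugraph.Graph()
G.add_edge_list(gdf["src"], gdf["dst"], None)

# For an undirected Graph only the upper triangle of the symmetrized
# edgelist is returned, so each edge appears once.
edge_df = G.view_edge_list()
print(edge_df)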
Example #26
0
def compare(src1, dst1, val1, src2, dst2, val2):
    #
    #  We will do comparison computations by using dataframe
    #  merge functions (essentially doing fast joins).  We
    #  start by making two data frames
    #
    df1 = cudf.DataFrame()
    df1["src1"] = src1
    df1["dst1"] = dst1
    if val1 is not None:
        df1["val1"] = val1

    df2 = cudf.DataFrame()
    df2["src2"] = src2
    df2["dst2"] = dst2
    if val2 is not None:
        df2["val2"] = val2

    #
    #  Check to see if all pairs in the original data frame
    #  still exist in the new data frame.  If we join (merge)
    #  the data frames where (src1[i]=src2[i]) and (dst1[i]=dst2[i])
    #  then we should get exactly the same number of entries in
    #  the data frame if we did not lose any data.
    #
    join = df1.merge(df2, left_on=["src1", "dst1"], right_on=["src2", "dst2"])
    assert len(df1) == len(join)

    if val1 is not None:
        #
        #  Check the values.  In this join, if val1 and val2 are
        #  the same then we are good.  If they are different then
        #  we need to check if the value is selected from the opposite
        #  direction, so we'll merge with the edges reversed and
        #  check to make sure that the values all match
        #
        diffs = join.query("val1 != val2")
        diffs_check = diffs.merge(df1,
                                  left_on=["src1", "dst1"],
                                  right_on=["dst1", "src1"])
        query = diffs_check.query("val1_y != val2")
        if len(query) > 0:
            print("differences: ")
            print(query)
            assert 0 == len(query)

    #
    #  Now check the symmetrized edges are present.  If the original
    #  data contains (u,v) we want to make sure that (v,u) is present
    #  in the new data frame.
    #
    #  We can accomplish this by doing the join (merge) where
    #  (src1[i] = dst2[i]) and (dst1[i] = src2[i]), and verifying
    #  that we get exactly the same number of entries in the data frame.
    #
    join = df1.merge(df2, left_on=["src1", "dst1"], right_on=["dst2", "src2"])
    assert len(df1) == len(join)

    if val1 is not None:
        #
        #  Check the values.  In this join, if val1 and val2 are
        #  the same then we are good.  If they are different then
        #  we need to check if the value is selected from the opposite
        #  direction, so we'll merge with the edges reversed and
        #  check to make sure that the values all match
        #
        diffs = join.query("val1 != val2")
        diffs_check = diffs.merge(df1,
                                  left_on=["src2", "dst2"],
                                  right_on=["src1", "dst1"])
        query = diffs_check.query("val1_y != val2")
        if len(query) > 0:
            print("differences: ")
            print(query)
            assert 0 == len(query)

    #
    #  Finally, let's check (in both directions) backwards.
    #  We want to make sure that no edges were created in
    #  the symmetrize logic that didn't already exist in one
    #  direction or the other.  This is a bit more complicated.
    #
    #  The complication here is that the original data could,
    #  for some edge (u,v) ALREADY contain the edge (v,u).  The
    #  symmetrized graph will not duplicate any edges, so the edge
    #  (u,v) will only be present once.  So we can't simply check
    #  counts of df2 joined with df1.
    #
    #  join1 will contain the join (merge) of df2 to df1 in the
    #        forward direction
    #  join2 will contain the join (merge) of df2 to df1 in the
    #        reverse direction
    #
    #  Finally, we'll do an outer join of join1 and join2, which
    #  will combine any (u,v)/(v,u) pairs that might exist into
    #  a joined row while keeping any (u,v) pairs that don't exist
    #  in both data frames as single rows.  This gives us a data frame
    #  with the same number of rows as the symmetrized data.
    #
    join1 = df2.merge(df1, left_on=["src2", "dst2"], right_on=["src1", "dst1"])
    join2 = df2.merge(df1, left_on=["src2", "dst2"], right_on=["dst1", "src1"])
    joinM = join1.merge(join2, how="outer", on=["src2", "dst2"])

    assert len(df2) == len(joinM)
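The helper above is meant for checking a symmetrized edge list against its input. A hedged driver sketch, assuming cugraph.symmetrize(src, dst) is available and returns the symmetrized source/destination columns:

import cudf
import cugraph

df = cudf.DataFrame({"src": [0, 1, 2], "dst": [1, 2, 0]})

sym_src, sym_dst = cugraph.symmetrize(df["src"], df["dst"])

# Every original (u, v) and its reverse (v, u) should now be present,
# and no edges should have been invented by the symmetrization.
compare(df["src"], df["dst"], None, sym_src, sym_dst, None)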
Example #27
0
def test_mh_model_support(tmpdir):
    df = cudf.DataFrame({
        "Authors": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"],
                    ["User_C"]],
        "Reviewers": [["User_A"], ["User_A", "User_E"], ["User_B", "User_C"],
                      ["User_C"]],
        "Engaging User": ["User_B", "User_B", "User_A", "User_D"],
        "Null User": ["User_B", "User_B", "User_A", "User_D"],
        "Post": [1, 2, 3, 4],
        "Cont1": [0.3, 0.4, 0.5, 0.6],
        "Cont2": [0.3, 0.4, 0.5, 0.6],
        "Cat1": ["A", "B", "A", "C"],
    })
    cat_names = ["Cat1", "Null User", "Authors",
                 "Reviewers"]  # , "Engaging User"]
    cont_names = ["Cont1", "Cont2"]
    label_name = ["Post"]
    out_path = os.path.join(tmpdir, "train/")
    os.mkdir(out_path)

    cats = cat_names >> ops.Categorify()
    conts = cont_names >> ops.Normalize()

    processor = nvt.Workflow(cats + conts + label_name)
    df_out = processor.fit_transform(nvt.Dataset(df)).to_ddf().compute()
    data_itr = torch_dataloader.TorchAsyncItr(
        nvt.Dataset(df_out),
        cats=cat_names,
        conts=cont_names,
        labels=label_name,
        batch_size=2,
    )
    emb_sizes = nvt.ops.get_embedding_sizes(processor)
    EMBEDDING_DROPOUT_RATE = 0.04
    DROPOUT_RATES = [0.001, 0.01]
    HIDDEN_DIMS = [1000, 500]
    LEARNING_RATE = 0.001
    model = Model(
        embedding_table_shapes=emb_sizes,
        num_continuous=len(cont_names),
        emb_dropout=EMBEDDING_DROPOUT_RATE,
        layer_hidden_dims=HIDDEN_DIMS,
        layer_dropout_rates=DROPOUT_RATES,
    ).cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    def rmspe_func(y_pred, y):
        "Return y_pred and y to non-log space and compute RMSPE"
        y_pred, y = torch.exp(y_pred) - 1, torch.exp(y) - 1
        pct_var = (y_pred - y) / y
        return (pct_var**2).mean().pow(0.5)

    train_loss, y_pred, y = process_epoch(
        data_itr,
        model,
        train=True,
        optimizer=optimizer,
        # transform=batch_transform,
        amp=False,
    )
    train_rmspe = None
    train_rmspe = rmspe_func(y_pred, y)
    assert train_rmspe is not None
    assert len(y_pred) > 0
    assert len(y) > 0
Example #28
0
def shortest_path_length(G, source, target=None):
    """
    Compute the distance from a source vertex to one or all vertices in the
    graph. Uses Single Source Shortest Path (SSSP).

    Parameters
    ----------
    G : cuGraph.Graph, NetworkX.Graph, or CuPy sparse COO matrix
        cuGraph graph descriptor with connectivity information. Edge weights,
        if present, should be single or double precision floating point
        values.

    source : int or str
        Index of the source vertex. Dependent on the graph type: an int if
        G is a cuGraph.Graph or a CuPy sparse COO matrix, a str if G is a
        NetworkX.Graph.

    target : int or str, optional
        Vertex to find the distance to. Dependent on the graph type: an int
        if G is a cuGraph.Graph or a CuPy sparse COO matrix, a str if G is a
        NetworkX.Graph.

    Returns
    -------
    Return value type is based on the input type.

    If target is None, returns:

        cudf.DataFrame
            df['vertex']
                vertex id

            df['distance']
                gives the path distance from the starting vertex

    If target is not None, returns:

        Distance from source to target vertex.
    """

    # verify target is in graph before traversing
    if target is not None:
        if not hasattr(G, "has_node"):
            # G is a cupy coo_matrix. Extract maximum possible vertex value
            as_matrix = G.toarray()
            if target < 0 or target >= max(as_matrix.shape[0],
                                           as_matrix.shape[1]):
                raise ValueError("Graph does not contain target vertex")
        elif not G.has_node(target):
            # G is an instance of cugraph or networkx graph
            raise ValueError("Graph does not contain target vertex")

    df = sssp(G, source)

    if isinstance(df, tuple):
        # cupy path, df is tuple of (distance, predecessor)
        if target is not None:
            return df[0][target - 1]
        results = cudf.DataFrame()
        results["vertex"] = range(df[0].shape[0])
        results["distance"] = df[0]
        return results

    else:
        # cugraph and networkx path
        if target is not None:
            target_distance = df.loc[df["vertex"] == target]
            return target_distance.iloc[0]["distance"]

        results = cudf.DataFrame()
        results["vertex"] = df["vertex"]
        results["distance"] = df["distance"]
        return results
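A hedged usage sketch for the wrapper above, assuming a small cugraph.Graph built from a cudf edge list via from_cudf_edgelist:

import cudf
import cugraph

gdf = cudf.DataFrame({"src": [0, 0, 1, 2], "dst": [1, 2, 3, 3]})

G = cugraph.Graph()
G.from_cudf_edgelist(gdf, source="src", destination="dst")

# Distances from vertex 0 to every reachable vertex ...
all_distances = shortest_path_length(G, 0)

# ... or just the distance from vertex 0 to vertex 3.
dist_0_to_3 = shortest_path_length(G, 0, target=3)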
Example #29
0
    def _get_column_selection(self, arg):
        return cudf.DataFrame(self._df._get_columns_by_index(arg))
Example #30
0
def np_to_cudf(X):
    df = cudf.DataFrame()
    for i in range(X.shape[1]):
        df['fea%d' % i] = cuda.to_device(np.ascontiguousarray(X[:, i]))
    return df
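A quick hedged usage sketch for the converter above (numpy, numba, and cudf are assumed to be installed; the array shape is arbitrary):

import numpy as np
import cudf
from numba import cuda

X = np.random.rand(100, 4).astype(np.float32)
gdf = np_to_cudf(X)   # one 'fea%d' column per input feature
print(gdf.head())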