Пример #1
0
def check_set_bool_iv(N):
    """Compare boolean-mask assignment in NumPy with the same steps in arkouda.

    The elements selected by ``x < N // 2`` are overwritten with the negated
    first ``N // 2`` elements, once with NumPy and once with arkouda, and the
    two results are compared element-wise.

    Parameters
    ----------
    N : int
        Length of the test arrays.

    Returns
    -------
    Whatever ``pass_fail`` returns for "all elements equal".
    """
    half = N // 2
    # NumPy reference, shipped to the server for the comparison
    expected = np.arange(N)
    expected[expected < half] = expected[:half] * -1
    expected = ak.array(expected)
    # the same sequence of operations done natively in arkouda
    actual = ak.arange(N)
    actual[actual < half] = actual[:half] * -1
    matches = expected == actual
    return pass_fail(matches.all())
Пример #2
0
def check_get_slice(N):
    """Compare a stride-2 slice read in NumPy with the same read in arkouda.

    Parameters
    ----------
    N : int
        Length of the array of ones that is sliced with ``[::2]``.

    Returns
    -------
    Whatever ``pass_fail`` returns for "all elements equal".
    """
    # NumPy reference, converted to a pdarray for the comparison
    expected = ak.array(np.ones(N)[::2])
    # arkouda-native slice
    actual = ak.ones(N)[::2]
    matches = expected == actual
    return pass_fail(matches.all())
Пример #3
0
def check_correctness(dtype, random):
    """Check that arkouda scatter (``c[i] = v``) agrees with NumPy.

    A shuffled, duplicate-free index vector is used to scatter a value vector
    into a zero array, once with NumPy and once with arkouda; the results
    must be close.

    Parameters
    ----------
    dtype : str
        Element type of the destination and value arrays. When `random` is
        true only 'int64' and 'float64' are supported.
    random : bool
        If true, scatter random values; otherwise scatter ones.

    Raises
    ------
    ValueError
        If `random` is true and `dtype` is neither 'int64' nor 'float64'.
    AssertionError
        If the arkouda result differs from the NumPy result.
    """
    Ni = 10**4
    Nv = 10**4
    # make indices unique
    # if indices are non-unique, results of unordered scatter are variable
    npi = np.arange(Ni)
    np.random.shuffle(npi)
    npc = np.zeros(Nv, dtype=dtype)
    # Build the value vector before touching the server so an unsupported
    # dtype fails fast with a clear message instead of an unbound local.
    if not random:
        npv = np.ones(Ni, dtype=dtype)
    elif dtype == 'int64':
        npv = np.random.randint(0, 2**32, Ni)
    elif dtype == 'float64':
        npv = np.random.random(Ni)
    else:
        raise ValueError(
            "Unsupported dtype {!r} for random values; "
            "expected 'int64' or 'float64'".format(dtype))
    aki = ak.array(npi)
    akc = ak.zeros(Nv, dtype=dtype)
    akv = ak.array(npv)
    npc[npi] = npv
    akc[aki] = akv
    assert np.allclose(npc, akc.to_ndarray())
Пример #4
0
def check_bool(N):
    """Check how pdarrays behave in a boolean context.

    The check expects ``a and b`` on two length-``N`` arrays to raise
    ``ValueError``, and expects a one-element array to be usable with ``and``.

    Parameters
    ----------
    N : int
        Length of the two arrays combined with ``and``.

    Returns
    -------
    Whatever ``pass_fail`` returns for the combined outcome.
    """
    a = ak.arange(N)
    b = ak.ones(N)
    try:
        _ = a and b
    except ValueError:
        correct = True
    except Exception:
        # any other error type is the wrong behavior
        correct = False
    else:
        # no error at all is also the wrong behavior; without this branch
        # `correct` would be unbound below
        correct = False
    d = ak.array([1])
    # normalize to a real bool: `d and 5` evaluates to 5 when d is truthy
    correct = correct and bool(d and 5)
    return pass_fail(correct)
Пример #5
0
def check_set_slice_value(N):
    """Compare scalar assignment through a stride-2 slice in NumPy and arkouda.

    Parameters
    ----------
    N : int
        Length of the array of ones whose even positions are set to -1.

    Returns
    -------
    Whatever ``pass_fail`` returns for "all elements equal".
    """
    # NumPy reference, converted to a pdarray for the comparison
    expected = np.ones(N)
    expected[::2] = -1
    expected = ak.array(expected)
    # arkouda-native assignment
    actual = ak.ones(N)
    actual[::2] = -1
    matches = expected == actual
    return pass_fail(matches.all())
Пример #6
0
 def locate(self,key):
     """Look up values by index label.

     The key may be a scalar, a list/tuple of scalars, a list/tuple of
     lists (for a multi-level index), a pdarray, or a Series. When a Series
     is given, its values are used as the lookup key and its own index is
     kept as the labels of the result.

     Plain Python sequences are converted to arkouda arrays as needed.

     Returns
     -------

     A Series containing the values corresponding to the key.
     """
     pdarray = ak.pdarrayclass.pdarray

     if isinstance(key, Series):
         # keep the labels of the key Series and look up its values
         kept_labels = key.index
         lookup_key = key.values
         found = aku.lookup(self.index.index, self.values, lookup_key)
         return Series((kept_labels, found))

     if isinstance(key, pdarray):
         selector = self.index.lookup(key)
     elif type(key) in (list, tuple):
         head = key[0]
         if isinstance(head, (list, tuple)):
             # nested sequence: convert unless it already holds pdarrays
             if not isinstance(head[0], pdarray):
                 key = [ak.array(column) for column in np.array(key).T.copy()]
         elif not isinstance(head, pdarray):
             # flat sequence of scalars
             key = ak.array(key)
         # otherwise: already a sequence of pdarrays, used unchanged
         selector = self.index.lookup(key)
     else:
         # scalar label
         selector = self.index == key
     return Series((self.index[selector], self.values[selector]))
Пример #7
0
def check_correctness(dtype, random, seed):
    """Check that a NumPy array survives a round trip through arkouda.

    Random data is generated with NumPy, sent to the server with
    ``ak.array`` and fetched back with ``to_ndarray``; the result must be
    close to the original.

    Parameters
    ----------
    dtype : str
        'int64' or 'float64'.
    random : bool
        Not used by this function.
    seed : int or None
        If not None, seeds NumPy's global random generator first.

    Raises
    ------
    ValueError
        If `dtype` is neither 'int64' nor 'float64'.
    AssertionError
        If the round-tripped data differs from the original.
    """
    N = 10**4

    if seed is not None:
        np.random.seed(seed)
    if dtype == 'int64':
        a = np.random.randint(1, N, N)
    elif dtype == 'float64':
        a = np.random.random(N) + 0.5
    else:
        # fail with a clear message instead of an unbound local below
        raise ValueError(
            "Unsupported dtype {!r}; expected 'int64' or 'float64'".format(
                dtype))

    aka = ak.array(a)
    npa = aka.to_ndarray()
    assert np.allclose(a, npa)
Пример #8
0
def check_sort(N):
    """Compare sorting a reversed range in NumPy with the same sort in arkouda.

    Parameters
    ----------
    N : int
        Length of the range that is reversed and then sorted.

    Returns
    -------
    Whatever ``pass_fail`` returns for "all elements equal".
    """
    # NumPy reference, converted to a pdarray for the comparison
    reversed_np = np.arange(N)[::-1]
    expected = ak.array(np.sort(reversed_np))
    # arkouda-native sort
    reversed_ak = ak.arange(N)[::-1]
    actual = ak.sort(reversed_ak)
    matches = expected == actual
    return pass_fail(matches.all())
Пример #9
0
def check_set_integer_iv(N):
    """Compare integer-index-vector assignment in NumPy and arkouda.

    The first ``N // 2`` positions are overwritten with ten times their own
    index through an integer index vector, in both libraries.

    Parameters
    ----------
    N : int
        Length of the test arrays.

    Returns
    -------
    Whatever ``pass_fail`` returns for "all elements equal".
    """
    half = N // 2
    # NumPy reference, converted to a pdarray for the comparison
    expected = np.arange(N)
    np_index = np.arange(half)
    expected[np_index] = np_index * 10
    expected = ak.array(expected)
    # arkouda-native assignment
    actual = ak.arange(N)
    ak_index = ak.arange(half)
    actual[ak_index] = ak_index * 10
    matches = expected == actual
    return pass_fail(matches.all())
Пример #10
0
def conn_comp(src, dst, printCComp=False, printLayers=False):
    """Collect the connected components of the graph given by ``src``/``dst``.

    Starting from the unique values of ``src``, a BFS is run from the first
    remaining vertex; every vertex it reaches forms one component and is
    removed from the remaining set. This repeats until no vertex remains.

    Parameters
    ----------
    src, dst
        Edge endpoint arrays, passed through to ``bfs``.
    printCComp : bool
        If true, print the visited/unvisited sets after each component.
    printLayers : bool
        Passed through to ``bfs``.

    Returns
    -------
    list
        One entry per component: the visited set returned by ``bfs``.
    """
    remaining = ak.unique(src)
    if printCComp:
        print("unvisited size = ", remaining.size, remaining)
    found = []
    while remaining.size > 0:
        # the first remaining vertex represents the next component
        start = remaining[0]
        _, reached = bfs(src, dst, ak.array([start]), printLayers)
        found.append(reached)
        # everything reached is done; drop it from the work set
        remaining = ak.setdiff1d(remaining, reached)
        if printCComp:
            print("  visited size = ", reached.size, reached)
            print("unvisited size = ", remaining.size, remaining)
    return found
Пример #11
0
    def drop_duplicates(self, subset=None, keep='first'):
        """
        Drop duplicated rows and return the resulting DataFrame.

        If a subset of the columns is provided then only one instance of each
        duplicated row will be returned (keep determines which row).

        Parameters
        ----------
        subset : Iterable of column names to use to dedupe.
            If empty or None, every entry of ``self._columns`` except the
            first is used.
        keep : {'first', 'last'}, default 'first'
            Determines which duplicates (if any) to keep. Any value other
            than 'last' is treated as 'first'.

        Returns
        -------
        DataFrame
            DataFrame with duplicates removed. An empty DataFrame is
            returned unchanged.

        Raises
        ------
        KeyError
            If a name in `subset` is not a column; the message names the
            missing column.
        """
        if self._empty:
            return self

        if not subset:
            subset = self._columns[1:]

        # Validate every requested column first, and report the column that
        # is actually missing rather than always the first one.
        for col in subset:
            if col not in self.data:
                raise KeyError(
                    "{} is not a column in the DataFrame.".format(col))

        if len(subset) == 1:
            # a single key is passed as an array, not as a one-element list
            grouping = ak.GroupBy(self.data[subset[0]])
        else:
            grouping = ak.GroupBy([self.data[col] for col in subset])

        if keep == 'last':
            # last row of each group: one before the next segment start,
            # plus the final position for the last group
            segment_ends = ak.concatenate(
                [grouping.segments[1:] - 1,
                 ak.array([grouping.permutation.size - 1])])
            return self[grouping.permutation[segment_ends]]
        # first row of each group
        return self[grouping.permutation[grouping.segments]]
Пример #12
0
    def sample(self, n=5):
        """
        Pick `n` rows at random, without replacement.

        Parameters
        ----------
        n : int (default=5)
            Number of rows to return.

        Returns
        -------
        akutil.DataFrame
            The sampled rows. When the frame has `n` rows or fewer, the
            frame itself is returned.
        """
        self.update_size()
        row_count = self._size
        if row_count <= n:
            return self
        picked = random.sample(range(row_count), n)
        return self[ak.array(picked)]
Пример #13
0
 def __init__(self, ar_tuple=None,data=None, index=None):
     """Build a Series from an (index, values) pair or from data.

     Parameters
     ----------
     ar_tuple : sequence, optional
         ``ar_tuple[0]`` is handed to ``aku.Index.factory`` and
         ``ar_tuple[1]`` is stored as the values. Takes precedence over
         `data` and `index`.
     data : pdarray or array-like, optional
         Values; converted with ``ak.array`` unless already a pdarray.
     index : optional
         Index for `data`; defaults to ``ak.arange(data.size)``.

     Raises
     ------
     TypeError
         If both `ar_tuple` and `data` are None.
     ValueError
         If the index and the values differ in size.
     """
     if ar_tuple is None:
         if data is None:
             raise TypeError("ar_tuple and data cannot both be null")
         if isinstance(data, ak.pdarrayclass.pdarray):
             values = data
         else:
             values = ak.array(data)
         self.values = values
         labels = ak.arange(values.size) if index is None else index
         self.index = aku.Index.factory(labels)
     else:
         self.index = aku.Index.factory(ar_tuple[0])
         self.values = ar_tuple[1]

     if self.index.size != self.values.size:
         raise ValueError("Index and data must have same length")
     self.size = self.index.size
Пример #14
0
def check_correctness(dtype, random):
    """Check that every operation in ``OPS`` gives close results in NumPy and arkouda.

    Each name in ``OPS`` is looked up as a zero-argument method on both the
    NumPy array and the matching arkouda array, and the two results are
    compared with ``np.isclose``.

    Parameters
    ----------
    dtype : str
        'int64' or 'float64'.
    random : bool
        If true, use random data; otherwise use a fixed range.

    Raises
    ------
    ValueError
        If `dtype` is neither 'int64' nor 'float64'.
    AssertionError
        If any operation's results are not close.
    """
    N = 10**4
    unsupported = "Unsupported dtype {!r}; expected 'int64' or 'float64'"
    if random:
        if dtype == 'int64':
            a = np.random.randint(0, 2**32, N)
        elif dtype == 'float64':
            a = np.random.random(N)
        else:
            raise ValueError(unsupported.format(dtype))
    else:
        if dtype == 'int64':
            a = np.arange(0, N, 1, dtype=dtype)
        elif dtype == 'float64':
            a = np.arange(1, 1 + 1 / N, (1 / N) / N, dtype=dtype)
        else:
            raise ValueError(unsupported.format(dtype))

    # The input is the same for every operation, so transfer it once.
    # NOTE(review): assumes the methods named in OPS do not modify the
    # array in place; the NumPy side already shares one array across them.
    aka = ak.array(a)
    for op in OPS:
        npr = getattr(a, op)()
        akr = getattr(aka, op)()
        assert np.isclose(npr, akr)
Пример #15
0
def time_ak_read(N_per_locale, numfiles, trials, dtype, path, seed, parquet):
    """Time reading every file matching ``path + '*'`` and print the results.

    Parameters
    ----------
    N_per_locale : int
        Elements per locale; only used for the printed problem size.
    numfiles : int
        Files per locale; only used in the printed header.
    trials : int
        Number of timed reads that are averaged.
    dtype : str
        Only used in the printed header.
    path : str
        Prefix of the files to read.
    seed
        Not used by this function.
    parquet : bool
        If true read with ``ak.read_parquet``, otherwise ``ak.read_all``.
    """
    print(">>> arkouda {} read".format(dtype))
    locales = ak.get_config()["numLocales"]
    total_elems = N_per_locale * locales
    print("numLocales = {}, N = {:,}, filesPerLoc = {}".format(
        locales, total_elems, numfiles))
    data = ak.array([])

    elapsed = []
    for _ in range(trials):
        t0 = time.time()
        if parquet:
            data = ak.read_parquet(path + '*')
        else:
            data = ak.read_all(path + '*')
        t1 = time.time()
        elapsed.append(t1 - t0)
    mean_read = sum(elapsed) / trials

    print("read Average time = {:.4f} sec".format(mean_read))

    # the rate is based on the size of the last array read
    total_bytes = data.size * data.itemsize
    print("read Average rate = {:.2f} GiB/sec".format(
        total_bytes / 2**30 / mean_read))
Пример #16
0
def expand(size, segs, vals):
    """Spread per-segment values across an array of length `size`.

    The first differences of `vals` are written at positions `segs` of a
    zero array, and the cumulative sum of that array is returned. With
    ascending `segs` starting at 0 this repeats ``vals[k]`` from ``segs[k]``
    up to the next segment start (assumes `segs` is sorted; confirm with
    callers).

    Parameters
    ----------
    size : int
        Length of the output array (passed to ``ak.zeros``).
    segs : ak.pdarray
        The indices where the values should be placed
    vals : ak.pdarray
        The values to be placed in each segment

    Returns
    -------
    pdarray
        The expanded array.

    """
    out = ak.zeros(size, vals.dtype)
    head = ak.array([vals[0]])
    steps = vals[1:] - vals[:-1]
    # cumsum of the step changes reconstructs the value held in each segment
    out[segs] = ak.concatenate((head, steps))
    return ak.cumsum(out)
Пример #17
0
    def coargsort(self, keys, ascending=True):
        """
        Compute the permutation that orders the dataframe by `keys`.

        Parameters
        ----------
        keys : list
            The keys to sort on.
        ascending : bool (default=True)
            If false, the ascending permutation is returned reversed.

        Returns
        -------
        ak.pdarray
            The permutation array that sorts the data on `keys`. An empty
            int64 array is returned for an empty dataframe.
        """

        if self._empty:
            return ak.array([], dtype=ak.int64)
        columns = [self[key] for key in keys]
        perm = ak.coargsort(columns)
        if ascending:
            return perm
        # walk the ascending permutation back to front
        return perm[ak.arange(self.size - 1, -1, -1)]
Пример #18
0
    def concat(cls, x, axis=0, ordered=True):
        """
        Concatenate a sequence of SegArrays

        Parameters
        ----------
        x : sequence of SegArray
            The SegArrays to concatenate
        axis : 0 or 1
            Select vertical (0) or horizontal (1) concatenation. If axis=1, all
            SegArrays must have same size.
        ordered : bool
            Must be True. This option is present for compatibility only, because unordered
            concatenation is not yet supported.

        Returns
        -------
        SegArray
            The input arrays joined into one SegArray. ``NotImplemented`` is
            returned instead if any element of `x` is not an instance of
            `cls`. With axis=1 and inputs of size 0, the first element of
            `x` is returned as-is.

        Raises
        ------
        ValueError
            If `ordered` is false, `x` is empty, the inputs do not all share
            one dtype, the inputs differ in size (axis=1), or `axis` is
            neither 0 nor 1.
        """
        if not ordered:
            raise ValueError(
                "Unordered concatenation not yet supported on SegArray; use ordered=True."
            )
        if len(x) == 0:
            raise ValueError("Empty sequence passed to concat")
        for xi in x:
            if not isinstance(xi, cls):
                return NotImplemented
        if len(set(xi.dtype for xi in x)) != 1:
            raise ValueError(
                "SegArrays must all have same dtype to concatenate")
        if axis == 0:
            # Running total of values seen so far; offsets of later inputs
            # are shifted by it.
            ctr = 0
            segs = []
            vals = []
            for xi in x:
                # Segment offsets need to be raised by length of previous values
                segs.append(xi.segments + ctr)
                ctr += xi.valsize
                # Values can just be concatenated
                vals.append(xi.values)
            return cls(ak.concatenate(segs), ak.concatenate(vals))
        elif axis == 1:
            sizes = set(xi.size for xi in x)
            if len(sizes) != 1:
                raise ValueError(
                    "SegArrays must all have same size to concatenate with axis=1"
                )
            if sizes.pop() == 0:
                return x[0]
            dt = list(x)[0].dtype
            # Per-row length of the result: sum of that row's length in
            # every input.
            newlens = sum(xi.lengths for xi in x)
            # Exclusive prefix sum of the lengths gives the new row offsets.
            newsegs = ak.cumsum(newlens) - newlens
            # Ignore sub-arrays that are empty in all arrays
            nonzero = ak.concatenate(
                (newsegs[:-1] < newsegs[1:], ak.array([True])))
            nzsegs = newsegs[nonzero]
            newvals = ak.zeros(newlens.sum(), dtype=dt)
            for xi in x:
                # Set up fromself for a scan, so that it steps up at the start of a segment
                # from the current array, and steps back down at the end
                fromself = ak.zeros(newvals.size + 1, dtype=ak.int64)
                fromself[nzsegs] += 1
                nzlens = xi.lengths[nonzero]
                fromself[nzsegs + nzlens] -= 1
                # After the scan, True marks the output slots owned by xi.
                fromself = (ak.cumsum(fromself[:-1]) == 1)
                newvals[fromself] = xi.values
                # Move each row's write position past the values just
                # placed, so the next input lands right after them.
                nzsegs += nzlens
            return cls(newsegs, newvals, copy=False)
        else:
            raise ValueError(
                "Supported values for axis are 0 (vertical concat) or 1 (horizontal concat)"
            )
Пример #19
0
        description="Example of cosine distance/similarity in arkouda")
    parser.add_argument('--server',
                        default="localhost",
                        help='server/Hostname of arkouda server')
    parser.add_argument('--port',
                        type=int,
                        default=5555,
                        help='Port of arkouda server')
    args = parser.parse_args()

    ak.v = False
    ak.connect(server=args.server, port=args.port)

    u1 = [1, 0, 0]
    v1 = [0, 1, 0]
    d1 = ak_cos_dist(ak.array(u1), ak.array(v1))
    print("d1 = ", d1)
    # d1 should be 1.0
    assert (np.allclose(d1, distance.cosine(u1, v1)))

    u2 = [100, 0, 0]
    d2 = ak_cos_dist(ak.array(u2), ak.array(v1))
    print("d2 = ", d2)
    # d2 should be 1.0
    assert (np.allclose(d2, distance.cosine(u2, v1)))

    u3 = [1, 1, 0]
    d3 = ak_cos_dist(ak.array(u3), ak.array(v1))
    print("d3 = ", d3)
    # d3 should be 0.29289321881345254
    assert (np.allclose(d3, distance.cosine(u3, v1)))
Пример #20
0
import numpy as np
import math
import gc
import sys

import arkouda as ak

ak.v = False
# Usage: script [server [port]]. sys.argv[2] is only read when it exists, so
# passing just a server no longer raises IndexError; anything not supplied
# falls back to the defaults of ak.connect.
if len(sys.argv) > 2:
    ak.connect(server=sys.argv[1], port=sys.argv[2])
elif len(sys.argv) > 1:
    ak.connect(server=sys.argv[1])
else:
    ak.connect()

# add an arkouda range to an array built from NumPy data, then fetch the sum
a = ak.arange(0, 10, 1)
b = np.linspace(10, 20, 10)
c = ak.array(b)
d = a + c
e = d.to_ndarray()

# assign a scalar through a strided slice
a = ak.ones(10)
a[::2] = 0
print(a)

# assign an array through a strided slice
a = ak.ones(10)
b = ak.zeros(5)
a[1::2] = b
print(a)

a = ak.zeros(10) # float64
b = ak.arange(0,10,1) # int64
a[:] = b # cast b to float64
Пример #21
0
    return all(x == y for x, y in zip(a, b))


errors = False
if __name__ == '__main__':
    # Usage: script [server [port]]. sys.argv[2] is only read when it
    # exists, so passing just a server no longer raises IndexError.
    if len(sys.argv) > 2:
        ak.connect(server=sys.argv[1], port=sys.argv[2])
    elif len(sys.argv) > 1:
        ak.connect(server=sys.argv[1])
    else:
        ak.connect()

    # this script's own source text serves as the word list
    with open(__file__, 'r') as f:
        base_words = np.array(f.read().split())

    test_strings = np.random.choice(base_words, N, replace=True)

    strings = ak.array(test_strings)
    cat = ak.Categorical(strings)
    print("strings =", strings)
    print("categorical =", cat)

    # int index
    assert (strings[N // 3] == test_strings[N // 3])
    assert (cat[N // 3] == test_strings[N // 3])
    print("int index passed")

    # slice
    assert (compare_strings(strings[N // 4:N // 3].to_ndarray(),
                            test_strings[N // 4:N // 3]))
    assert (compare_strings(cat[N // 4:N // 3].to_ndarray(),
                            test_strings[N // 4:N // 3]))
    print("slice passed")
Пример #22
0
def run_test(levels):
    """Compare arkouda GroupBy results against pandas and print a summary.

    The arrays from ``make_arrays`` are loaded into a pandas DataFrame and
    into arkouda. ``count()`` and every reduction in
    ``ak.GroupBy.Reductions`` are then run over the 'int64', 'float64' and
    'bool' value columns in both libraries and compared.

    Parameters
    ----------
    levels : int
        1 groups by 'keys'; 2 groups by 'keys' and 'keys2'.
        NOTE(review): any other value leaves ``akg`` and ``keyname``
        undefined.

    Returns
    -------
    None
        The failure, test and not-implemented counts are printed.
    """
    d = make_arrays()
    df = pd.DataFrame(d)
    akdf = {k:ak.array(v) for k, v in d.items()}
    if levels == 1:
        akg = ak.GroupBy(akdf['keys'])
        keyname = 'keys'
    elif levels == 2:
        akg = ak.GroupBy([akdf['keys'], akdf['keys2']])
        keyname = ['keys', 'keys2']
    tests = 0
    failures = 0
    not_impl = 0
    print(f"Doing .count()")
    tests += 1
    pdkeys, pdvals = groupby_to_arrays(df, keyname, 'int64', 'count', levels)
    # print("Pandas:")
    # print(pdkeys)
    # print(pdvals)
    akkeys, akvals = akg.count()
    # akkeys = akkeys.to_ndarray()
    akvals = akvals.to_ndarray()
    # print("Arkouda:")
    # print(akkeys)
    # print(akvals)
    # if not np.allclose(pdkeys, akkeys):
    #     print(f"Different keys")
    #     failures += 1
    failures += compare_keys(pdkeys, akkeys, levels, pdvals, akvals)
    # elif not np.allclose(pdvals, akvals):
    #     print(f"Different values (abs diff = {np.abs(pdvals - akvals).sum()})")
    #     failures += 1
    for vname in ('int64', 'float64', 'bool'):
        for op in ak.GroupBy.Reductions:
            print(f"\nDoing aggregate({vname}, {op})")
            tests += 1
            # Cleared when pandas cannot produce a reference result
            do_check = True
            try:
                pdkeys, pdvals = groupby_to_arrays(df, keyname, vname, op, levels)
                # print("Pandas:")
                # print(pdkeys)
                # print(pdvals)
            except Exception as E:
                print("Pandas does not implement")
                do_check = False
            try:
                akkeys, akvals = akg.aggregate(akdf[vname], op)
                # akkeys = akkeys.to_ndarray()
                akvals = akvals.to_ndarray()
                # print("Arkouda:")
                # print(akkeys)
                # print(akvals)
            except RuntimeError as E:
                # A RuntimeError from arkouda is counted as "not
                # implemented", not as a failure
                print("Arkouda error: ", E)
                not_impl += 1
                do_check = False
                continue
            if not do_check:
                continue
            if op.startswith('arg'):
                # For arg* reductions compare the values found at the
                # returned positions rather than the positions themselves
                pdextrema = df[vname][pdvals]
                akextrema = akdf[vname][ak.array(akvals)].to_ndarray()
                if not np.allclose(pdextrema, akextrema):
                    print(f"Different argmin/argmax: Arkouda failed to find an extremum")
                    print("pd: ", pdextrema)
                    print("ak: ", akextrema)
                    failures += 1
            else:
                # if not np.allclose(pdkeys, akkeys):
                #     print(f"Different keys")
                #     failures += 1
                failures += compare_keys(pdkeys, akkeys, levels, pdvals, akvals)
                # elif not np.allclose(pdvals, akvals):
                #     print(f"Different values (abs diff = {np.where(np.isfinite(pdvals) & np.isfinite(akvals), np.abs(pdvals - akvals), 0).sum()})")
                #     failures += 1
    print(f"\n{failures} failures in {tests} tests ({not_impl} not implemented)")
Пример #23
0
def inner_join(left, right, wherefunc=None, whereargs=None):
    '''Perform inner join on values in <left> and <right>,
    using conditions defined by <wherefunc> evaluated on
    <whereargs>, returning indices of left-right pairs.

    Parameters
    ----------
    left : pdarray(int64)
        The left values to join
    right : pdarray(int64)
        The right values to join
    wherefunc : function, optional
        Function that takes two pdarray arguments and returns
        a pdarray(bool) used to filter the join. Results for
        which wherefunc is False will be dropped.
    whereargs : 2-tuple of pdarray
        The two pdarray arguments to wherefunc. Required when
        wherefunc is given.

    Returns
    -------
    leftInds : pdarray(int64)
        The left indices of pairs that meet the join condition
    rightInds : pdarray(int64)
        The right indices of pairs that meet the join condition

    Raises
    ------
    ValueError
        If wherefunc does not take exactly two parameters, whereargs
        is not a 2-tuple whose elements match the sizes of left and
        right, or wherefunc fails on a small sample of whereargs.

    Notes
    -----
    The return values satisfy the following assertions

    `assert (left[leftInds] == right[rightInds]).all()`
    `assert wherefunc(whereargs[0][leftInds], whereargs[1][rightInds]).all()`

    '''
    from inspect import signature
    # Length of the prefix used to smoke-test wherefunc before the join
    sample = min((left.size, right.size, 5))
    if wherefunc is not None:
        if len(signature(wherefunc).parameters) != 2:
            raise ValueError(
                "wherefunc must be a function that accepts exactly two arguments"
            )
        if whereargs is None or len(whereargs) != 2:
            raise ValueError(
                "whereargs must be a 2-tuple with left and right arg arrays")
        if whereargs[0].size != left.size:
            raise ValueError(
                "Left whereargs must be same size as left join values")
        if whereargs[1].size != right.size:
            raise ValueError(
                "Right whereargs must be same size as right join values")
        try:
            # Result is discarded; this only checks that the call succeeds
            _ = wherefunc(whereargs[0][:sample], whereargs[1][:sample])
        except Exception as e:
            raise ValueError("Error evaluating wherefunc") from e

    # Need dense 0-up right index, to filter out left not in right
    keep, (denseLeft, denseRight) = right_align(left, right)
    # Turn the boolean mask into the positions it selects
    keep = ak.arange(keep.size)[keep]
    # GroupBy right
    byRight = ak.GroupBy(denseRight)
    # Get segment boundaries (starts, ends) of right for each left item
    rightSegs = ak.concatenate((byRight.segments, ak.array([denseRight.size])))
    starts = rightSegs[denseLeft]
    ends = rightSegs[denseLeft + 1]
    fullSize = (ends - starts).sum()
    # print(f"{left.size+right.size:,} input rows --> {fullSize:,} joins ({fullSize/(left.size+right.size):.1f} x) ")
    # gen_ranges for gather of right items
    fullSegs, ranges = gen_ranges(starts, ends)
    # Evaluate where clause
    if wherefunc is None:
        filtRanges = ranges
        filtSegs = fullSegs
        keep12 = keep
    else:
        # Gather right whereargs
        rightWhere = whereargs[1][byRight.permutation][ranges]
        # Expand left whereargs
        leftWhere = expand(whereargs[0][keep], fullSegs, ranges.size)
        # Evaluate wherefunc and filter ranges, recompute segments
        whereSatisfied = wherefunc(leftWhere, rightWhere)
        filtRanges = ranges[whereSatisfied]
        # Exclusive running count of surviving pairs
        scan = ak.cumsum(whereSatisfied) - whereSatisfied
        filtSegsWithZeros = scan[fullSegs]
        # Surviving pairs per left item; the last size is the remainder
        filtSegSizes = ak.concatenate(
            (filtSegsWithZeros[1:] - filtSegsWithZeros[:-1],
             ak.array([whereSatisfied.sum() - filtSegsWithZeros[-1]])))
        # Drop left items that have no surviving pair
        keep2 = (filtSegSizes > 0)
        filtSegs = filtSegsWithZeros[keep2]
        keep12 = keep[keep2]
    # Gather right inds and expand left inds
    rightInds = byRight.permutation[filtRanges]
    leftInds = expand(ak.arange(left.size)[keep12], filtSegs, filtRanges.size)
    return leftInds, rightInds
Пример #24
0
                               times=times,
                               includeDelimiter=inc,
                               keepPartial=part)
        triples = [s.rpartition(delim) for s in test_strings]
        for i in range(times - 1):
            triples = [rslide(t, delim) for t in triples]
        ltest, rtest = rmunge(triples, inc, part)
        assert ((ltest == ls.to_ndarray()).all()
                and (rtest == rs.to_ndarray()).all())

    print("peel passed")

    # stick

    test_strings2 = np.random.choice(base_words, N, replace=True)
    strings2 = ak.array(test_strings2)
    stuck = strings.stick(strings2, delimiter=delim).to_ndarray()
    tstuck = np.array(
        [delim.join((a, b)) for a, b in zip(test_strings, test_strings2)])
    assert ((stuck == tstuck).all())
    assert ((strings + strings2) == strings.stick(strings2,
                                                  delimiter="")).all()

    lstuck = strings.lstick(strings2, delimiter=delim).to_ndarray()
    tlstuck = np.array(
        [delim.join((b, a)) for a, b in zip(test_strings, test_strings2)])
    assert ((lstuck == tlstuck).all())
    assert ((strings2 + strings) == strings.lstick(strings2,
                                                   delimiter="")).all()
    print("stick passed")