예제 #1
0
def dataset(path, treepath, namespace=None, **kwargs):
    """Build an ``oamap.dataset.Dataset`` spanning every TTree matched by *path*.

    Args:
        path: File path or pattern handed to ``uproot.tree.numentries``.
        treepath: Path of the TTree inside each file. The dataset is named
            after the text following the last ``/`` and preceding the first
            ``;`` of that final component.
        namespace: Backend namespace; defaults to a ``root(path, treepath)``
            string built from the reprs of both arguments.
        **kwargs: Forwarded to ``uproot.tree.numentries``. ``total`` and
            ``blocking`` are always overridden; ``localsource`` is only
            filled in when the caller did not supply one.

    Returns:
        A Dataset whose partition offsets are the running sum of the entry
        counts reported for each matched file.

    Raises:
        ValueError: If nothing was matched.
    """
    import uproot

    if namespace is None:
        namespace = "root({0}, {1})".format(repr(path), repr(treepath))

    # Only install the default file source when the caller has not chosen one.
    kwargs.setdefault(
        "localsource",
        lambda path: uproot.source.file.FileSource(
            path, chunkbytes=8 * 1024, limitbytes=None))
    # Per-file counts are needed (not a grand total), and they are needed now.
    kwargs.update(total=False, blocking=True)

    entries_by_file = uproot.tree.numentries(path, treepath, **kwargs)
    if len(entries_by_file) == 0:
        raise ValueError("path {0} matched no TTrees".format(repr(path)))

    # boundaries[i]..boundaries[i + 1] is the entry range served by filenames[i].
    filenames = []
    boundaries = [0]
    for filename, count in entries_by_file.items():
        filenames.append(filename)
        boundaries.append(boundaries[-1] + count)

    # The schema is read from the first matched file only; its doc string is
    # moved from the schema onto the dataset.
    first = filenames[0]
    tree_schema = schema(first, treepath, namespace=namespace)
    description = tree_schema.doc
    tree_schema.doc = None

    dataset_name = treepath.split("/")[-1].split(";")[0]
    backends = {namespace: ROOTBackend(filenames, treepath)}

    return oamap.dataset.Dataset(
        dataset_name,
        tree_schema,
        backends,
        oamap.dataset.SingleThreadExecutor(),
        boundaries,
        extension=None,
        packing=None,
        doc=description,
        metadata={"schemafrom": first})
예제 #2
0
    def test_Pointer(self):
        """Round-trip a three-node cyclic linked list through a Pointer schema.

        The schema is a ``Node`` record whose ``next`` field points back at
        the record itself. Labels are compared for the head and the six
        nodes reached by following ``next``, i.e. two full laps of the cycle.
        """
        class Node(object):
            def __init__(self, label, next):
                self.label = label
                self.next = next

        schema = Record({"label": Primitive("i8")}, name="Node")
        schema["next"] = Pointer(schema)

        # 0 -> 1 -> 2 -> back to 0
        tail = Node(2, None)
        value = Node(0, Node(1, tail))
        tail.next = value

        arrays = oamap.fill.fromdata(value, schema)
        columnar = schema(arrays)

        expected, actual = value, columnar
        self.assertEqual(expected.label, actual.label)
        for _ in range(6):
            expected, actual = expected.next, actual.next
            self.assertEqual(expected.label, actual.label)
예제 #3
0
 def check(self, value, schema=None, debug=False):
     """Assert that *value* survives a fill / proxy / ``tojson`` round trip.

     Args:
         value: Python data to round-trip.
         schema: Schema to fill with; inferred from *value* when ``None``.
         debug: When true, print the schema, every filled array (sorted by
             name) and the resulting proxy as each one becomes available.
     """
     if schema is None:
         schema = oamap.inference.fromdata(value)
     if debug:
         print("schema: {0}".format(schema))

     filled = oamap.fill.fromdata(value, schema)
     if debug:
         print("arrays:")
         for key in sorted(filled):
             print("  {0}: {1}".format(repr(key), filled[key]))

     proxied = schema(filled)
     if debug:
         print("columnar: {0}".format(proxied))

     roundtripped = oamap.proxy.tojson(proxied)
     self.assertEqual(value, roundtripped)
예제 #4
0
파일: npz.py 프로젝트: martindurant/oamap
def load(npzfile, prefix="object", delimiter="-"):
    """Open a Numpy ``.npz`` archive as an OAMap proxy.

    If the archive holds a one-dimensional ``uint8`` array under *prefix*,
    its bytes are parsed as a JSON-serialized dataset and that dataset's
    schema is used. Otherwise (entry missing, wrong dtype/shape, or
    unparsable) the schema is inferred from the archive's array names.

    Args:
        npzfile: An open ``NpzFile``, or anything ``numpy.load`` accepts that
            yields one.
        prefix: Name of the serialized-dataset entry; also passed to name
            inference.
        delimiter: Passed to name inference.

    Returns:
        The result of calling the chosen schema on the ``NpzFile``.

    Raises:
        TypeError: If *npzfile* is not, and does not load as, an ``NpzFile``.
    """
    if not isinstance(npzfile, numpy.lib.npyio.NpzFile):
        npzfile = numpy.load(npzfile)
    if not isinstance(npzfile, numpy.lib.npyio.NpzFile):
        raise TypeError(
            "npzfile must be a Numpy NpzFile (e.g. oamap.source.npz.load(numpy.load(\"filename.npz\")))"
        )

    dataset = None
    try:
        datasetarray = npzfile[prefix]
        # Explicit check instead of ``assert`` so it still runs under ``-O``.
        if (datasetarray.dtype == numpy.dtype(numpy.uint8)
                and len(datasetarray.shape) == 1):
            # ``tobytes`` replaces ``tostring``, which NumPy 2.0 removed.
            dataset = oamap.schema.Dataset.fromjsonstring(
                datasetarray.tobytes())
    except Exception:
        # Best effort: any failure to read a stored dataset falls back to
        # inference, but KeyboardInterrupt/SystemExit are no longer swallowed.
        dataset = None

    if dataset is None:
        schema = oamap.inference.fromnames(npzfile.keys(),
                                           prefix=prefix,
                                           delimiter=delimiter)
    else:
        schema = dataset.schema

    return schema(npzfile)
예제 #5
0
def proxy(table):
    """Expose an Arrow table as an OAMap proxy backed by the table's buffers.

    The schema comes from ``schema(table)`` and is instantiated with an
    array provider that maps each requested array name onto a buffer of the
    matching column.

    Args:
        table: Arrow table; read through ``schema.names``, ``num_rows`` and
            integer column indexing.

    Returns:
        The result of calling the schema on the array provider.
    """
    import pyarrow

    class _ArrayDict(object):
        """Array provider resolving OAMap array names against *table*."""

        def __init__(self, table):
            self.table = table

        def chop(self, name):
            """Split ``"column/3"`` into ``("column", 3)`` at the last slash."""
            slashindex = name.rindex("/")
            return name[:slashindex], int(name[slashindex + 1:])

        def frombuffer(self, chunk, bufferindex):
            """Return buffer number *bufferindex* of *chunk* as a Numpy array.

            Buffers are addressed in the order walked by ``recurse``: each
            nesting level contributes a validity buffer followed by an
            offsets buffer (lists) or a data buffer (leaf types).
            """
            def truncate(array, length, offset=0):
                return array[:length + offset]

            def mask(index, length):
                """Turn the validity bitmap at *index* into an OAMap mask.

                Valid slots receive consecutive positions 0, 1, 2, ...;
                null slots receive ``Masked.maskedvalue``. A missing bitmap
                means every slot is valid.
                """
                buf = chunk.buffers()[index]
                if buf is None:
                    return numpy.arange(length,
                                        dtype=oamap.generator.Masked.maskdtype)
                else:
                    packed = numpy.frombuffer(buf, dtype=numpy.uint8)
                    # Arrow validity bitmaps number bits from the least
                    # significant end of each byte, whereas unpackbits emits
                    # the most significant bit first; reverse every group of
                    # eight so that position i corresponds to element i.
                    bits = numpy.unpackbits(packed).reshape(-1, 8)[:, ::-1]
                    unmasked = truncate(
                        bits.reshape(-1).view(numpy.bool_), length)
                    out = numpy.empty(len(unmasked),
                                      dtype=oamap.generator.Masked.maskdtype)
                    out[unmasked] = numpy.arange(unmasked.sum(),
                                                 dtype=out.dtype)
                    out[~unmasked] = oamap.generator.Masked.maskedvalue
                    return out

            def recurse(tpe, index, length):
                if isinstance(tpe, pyarrow.lib.ListType):
                    if index == bufferindex:
                        # list mask
                        return mask(index, length)
                    elif index + 1 == bufferindex:
                        # list starts (length + 1 offsets; stops are derived
                        # from these in getall)
                        return truncate(
                            numpy.frombuffer(chunk.buffers()[index + 1],
                                             dtype=numpy.int32), length, 1)
                    else:
                        # descend into list: the last offset is the number
                        # of items in the child level
                        length = truncate(
                            numpy.frombuffer(chunk.buffers()[index + 1],
                                             dtype=numpy.int32), length, 1)[-1]
                        return recurse(tpe.value_type, index + 2, length)

                elif isinstance(tpe, pyarrow.lib.DataType):
                    if index == bufferindex:
                        # data mask
                        return mask(index, length)
                    elif index + 1 == bufferindex:
                        # data
                        return truncate(
                            numpy.frombuffer(chunk.buffers()[index + 1],
                                             dtype=tpe.to_pandas_dtype()),
                            length)
                    else:
                        raise AssertionError

                else:
                    raise NotImplementedError

            return recurse(chunk.type, 0, len(chunk))

        def getall(self, names):
            """Return a dict mapping each requested name to its array.

            Empty names denote the outermost list: one start at 0 and one
            stop at ``table.num_rows``. Any other stops array is sliced from
            the corresponding starts array, which must therefore appear
            earlier in *names*.
            """
            out = {}
            for name in names:
                if len(str(name)) == 0:
                    if isinstance(name, oamap.generator.StartsRole):
                        out[name] = numpy.array(
                            [0], dtype=oamap.generator.ListGenerator.posdtype)
                    elif isinstance(name, oamap.generator.StopsRole):
                        out[name] = numpy.array(
                            [self.table.num_rows],
                            dtype=oamap.generator.ListGenerator.posdtype)
                    else:
                        raise AssertionError

                elif isinstance(name, oamap.generator.StopsRole):
                    out[name] = out[name.starts][1:]

                else:
                    columnname, bufferindex = self.chop(str(name))
                    column = self.table[self.table.schema.names.index(
                        columnname)]
                    chunks = column.data.chunks
                    if len(chunks) == 0:
                        raise ValueError(
                            "Arrow column {0} has no chunks".format(
                                repr(columnname)))
                    elif len(chunks) == 1:
                        out[name] = self.frombuffer(chunks[0], bufferindex)
                    else:
                        out[name] = numpy.concatenate([
                            self.frombuffer(chunk, bufferindex)
                            for chunk in chunks
                        ])

            return out

    return schema(table)(_ArrayDict(table))