Example #1
    def fillpage(self, values, copy=True):
        if set(values.keys()) != set(self.fields):
            raise ValueError("fields in initExisting (%s) differ from original fields (%s)" % (str(set(values.keys())), str(set(self.fields))))

        lengths = [len(arr) for arr in values.values()]
        same = [l == lengths[0] for l in lengths]
        if False in same:
            raise ValueError("arrays have different lengths: %s" % lengths)

        if not hasattr(self, "pages"):
            raise UninitializedError("UniTable initMemory or initExisting must be called before fillpage")

        page = UniPage(self.fields, self.types)
        page.initExisting(lengths[0], values, copy=copy)

        if self.pages[-1].length == 0:
            # empty page (probably just called initMemory); replace it and leave the starts list as it is
            self.pages[-1] = page
            
        else:
            # non-empty page (either full from fillpage() or partially full from fill()); write it and add this new page
            # _writing() has protection against being called twice
            self._writing()
            self.pages.append(page)
            self.starts.append(self.starts[-1] + lengths[0])

        # either way, update lengths
        self.length += lengths[0]

        # write out the new page and cull any excess (_cullPages() also has protection against being called twice)
        self._writing()
        self._cullPages()
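
A minimal usage sketch for fillpage, assuming the UniTable constructor takes a field list and a field-to-type dict as in the listings below; the field names and the "float64" type strings are illustrative assumptions, not taken from the source:

    import numpy

    table = UniTable(["x", "y"], {"x": "float64", "y": "float64"})
    table.initMemory()  # required first; otherwise fillpage raises UninitializedError

    # Insert a whole page at once: keys must match the table's fields exactly,
    # and all arrays must have the same length.
    table.fillpage({"x": numpy.arange(5.0), "y": numpy.zeros(5)}, copy=False)
    assert table.length == 5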
Example #2
    def initMemory(self, pageSize=None):
        if pageSize is None:
            pageSize = default["pageSize"]

        if pageSize <= 0:
            raise ValueError("UniTable pageSize must be positive (not %d)" % pageSize)

        self.pageSize = pageSize
        page = UniPage(self.fields, self.types)
        page.initMemory(self.pageSize)
        self.pages = [page]
        self.starts = [0]
        self.length = 0
Example #3
    def initMemory(self, pageSize=None):
        if pageSize is None:
            pageSize = default["pageSize"]

        if pageSize <= 0:
            raise ValueError("UniTable pageSize must be positive (not %d)" %
                             pageSize)

        self.pageSize = pageSize
        page = UniPage(self.fields, self.types)
        page.initMemory(self.pageSize)
        self.pages = [page]
        self.starts = [0]
        self.length = 0
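
A short sketch of calling initMemory; the field names, type strings, and page size are assumptions for illustration:

    table = UniTable(["id", "score"], {"id": "int64", "score": "float64"})
    table.initMemory(pageSize=1024)  # allocates one empty UniPage; starts == [0], length == 0

    try:
        table.initMemory(pageSize=-1)  # non-positive page sizes are rejected
    except ValueError as err:
        print(err)  # UniTable pageSize must be positive (not -1)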
Example #4
    def initExisting(self, values, copy=True):
        if set(values.keys()) != set(self.fields):
            raise ValueError("fields in initExisting (%s) differ from original fields (%s)" % (str(set(values.keys())), str(set(self.fields))))

        lengths = [len(arr) for arr in values.values()]
        same = [l == lengths[0] for l in lengths]
        if False in same:
            raise ValueError("arrays have different lengths: %s" % lengths)

        self.pageSize = lengths[0]
        page = UniPage(self.fields, self.types)
        page.initExisting(self.pageSize, values, copy=copy)
        self.pages = [page]
        self.starts = [0]
        self.length = self.pageSize
Example #5
    def initExisting(self, values, copy=True):
        if set(values.keys()) != set(self.fields):
            raise ValueError(
                "fields in initExisting (%s) differ from original fields (%s)"
                % (str(set(values.keys())), str(set(self.fields))))

        lengths = [len(arr) for arr in values.values()]
        same = [l == lengths[0] for l in lengths]
        if False in same:
            raise ValueError("arrays have different lengths: %s" % lengths)

        self.pageSize = lengths[0]
        page = UniPage(self.fields, self.types)
        page.initExisting(self.pageSize, values, copy=copy)
        self.pages = [page]
        self.starts = [0]
        self.length = self.pageSize
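
A hedged sketch of initExisting with pre-built arrays (names and types are illustrative); the common array length becomes both the page size and the table length:

    import numpy

    table = UniTable(["a", "b"], {"a": "float64", "b": "float64"})
    table.initExisting({"a": numpy.arange(3.0), "b": numpy.ones(3)}, copy=True)
    assert table.pageSize == 3 and table.length == 3
    # Arrays of unequal length would raise ValueError("arrays have different lengths: ...")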
Example #6
    def fill(self, values):
        try:
            self.pages[-1].fill(values)
            self.length += 1

        except BeyondPageException:
            self.starts.append(self.starts[-1] + self.pages[-1].allocation)

            self._writing()

            page = UniPage(self.fields, self.types)
            page.initMemory(self.pageSize)
            page.categories = self.pages[-1].categories
            self.pages.append(page)
            self.pages[-1].fill(values)
            self.length += 1

            self._cullPages()

        except AttributeError:
            raise UninitializedError("UniTable initMemory or initExisting must be called before fill")
Example #7
    def fill(self, values):
        try:
            self.pages[-1].fill(values)
            self.length += 1

        except BeyondPageException:
            self.starts.append(self.starts[-1] + self.pages[-1].allocation)

            self._writing()

            page = UniPage(self.fields, self.types)
            page.initMemory(self.pageSize)
            page.categories = self.pages[-1].categories
            self.pages.append(page)
            self.pages[-1].fill(values)
            self.length += 1

            self._cullPages()

        except AttributeError:
            raise UninitializedError(
                "UniTable initMemory or initExisting must be called before fill"
            )
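
As the CSV branch of readUniTable below shows, fill takes one record as a sequence of values in field order. A sketch with a deliberately small page size, so the loop exercises the BeyondPageException rollover path (all names are assumptions):

    table = UniTable(["x", "y"], {"x": "float64", "y": "float64"})
    table.initMemory(pageSize=2)  # tiny page: the third fill rolls over to a new page

    for i in range(5):
        table.fill([float(i), 2.0 * float(i)])
    assert table.length == 5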
Example #8
    def fillpage(self, values, copy=True):
        if set(values.keys()) != set(self.fields):
            raise ValueError(
                "fields in initExisting (%s) differ from original fields (%s)"
                % (str(set(values.keys())), str(set(self.fields))))

        lengths = [len(arr) for arr in values.values()]
        same = [l == lengths[0] for l in lengths]
        if False in same:
            raise ValueError("arrays have different lengths: %s" % lengths)

        if not hasattr(self, "pages"):
            raise UninitializedError(
                "UniTable initMemory or initExisting must be called before fillpage"
            )

        page = UniPage(self.fields, self.types)
        page.initExisting(lengths[0], values, copy=copy)

        if self.pages[-1].length == 0:
            # empty page (probably just called initMemory); replace it and leave the starts list as it is
            self.pages[-1] = page

        else:
            # non-empty page (either full from fillpage() or partially full from fill()); write it and add this new page
            # _writing() has protection against being called twice
            self._writing()
            self.pages.append(page)
            self.starts.append(self.starts[-1] + lengths[0])

        # either way, update lengths
        self.length += lengths[0]

        # write out the new page and cull any excess (_cullPages() also has protection against being called twice)
        self._writing()
        self._cullPages()
Example #9
def readUniTable(fileLocation, format=None, sorter=None, pageSize=None, mapInvalid=None, mapMissing=None, **parameters):
    format = getformat(fileLocation, format)

    ################################################################ CSV
    if format == "CSV":
        csvInput = CSVStream(fileLocation, sorter, **parameters)

        if csvInput.types is not None:
            types = csvInput.types
        else:
            types = dict((f, "string") for f in csvInput.fields)

        _mapInvalid = dict((f, str("INVALID") if types[f] in ("category", "string") else -1000) for f in csvInput.fields)
        if mapInvalid is None:
            mapInvalid = _mapInvalid
        else:
            _mapInvalid.update(mapInvalid)
            mapInvalid = _mapInvalid

        _mapMissing = dict((f, str("MISSING") if types[f] in ("category", "string") else -1000) for f in csvInput.fields)
        if mapMissing is None:
            mapMissing = _mapMissing
        else:
            _mapMissing.update(mapMissing)
            mapMissing = _mapMissing

        table = UniTable(csvInput.fields, types)
        table.initMemory(pageSize)

        for record in csvInput:
            table.fill([mapInvalid[f] if r is INVALID else mapMissing[f] if r is MISSING else r for f, r in zip(csvInput.fields, record)])

        return table

    ################################################################ XML
    if format == "XML":
        xmlInput = XMLStream(fileLocation, sorter, **parameters)

        if xmlInput.types is not None:
            types = xmlInput.types
        else:
            types = dict((f, "string") for f in xmlInput.fields)

        _mapInvalid = dict((f, str("INVALID") if types[f] in ("category", "string") else -1000) for f in xmlInput.fields)
        if mapInvalid is None:
            mapInvalid = _mapInvalid
        else:
            _mapInvalid.update(mapInvalid)
            mapInvalid = _mapInvalid

        _mapMissing = dict((f, str("MISSING") if types[f] in ("category", "string") else -1000) for f in xmlInput.fields)
        if mapMissing is None:
            mapMissing = _mapMissing
        else:
            _mapMissing.update(mapMissing)
            mapMissing = _mapMissing

        table = UniTable(xmlInput.fields, types)
        table.initMemory(pageSize)

        for record in xmlInput:
            table.fill([mapInvalid[f] if r is INVALID else r for f, r in [(f, record.get(f, mapMissing[f])) for f in xmlInput.fields]])

        return table

    ################################################################ NAB
    elif format == "NAB":
        fileNames = getfiles(fileLocation, sorter)
        if len(fileNames) == 0:
            raise IOError("No files match \"%s\" (even with wildcards)" % fileLocation)

        fields = None
        types = None
        strings = {}
        args = {}
        for fileName in fileNames:
            file = open(fileName, "rb")
            header = file.readline().rstrip()
            file.close()

            headerfields = header.decode("utf-8").split()
            if headerfields[0] != "RecArray":
                raise BadlyFormattedInputData("NAB file \"%s\" does not begin with 'RecArray'" % fileName)

            args[fileName] = dict(asciistr(f).split("=") for f in headerfields[1:])

            if "masktype" in args.keys():
                raise NotImplementedError("No support yet for NAB files (such as \"%s\") with masked NumPy arrays" % fileName)

            if set(args[fileName].keys()) != set(["formats", "names"]):
                raise BadlyFormattedInputData("NAB file \"%s\" headers are %s, rather than set([\"formats\", \"names\"])" % (fileName, str(set(args[fileName].keys()))))

            thisfields = args[fileName]["names"].split(",")
            thistypes = args[fileName]["formats"].split(",")
            for i in xrange(len(thistypes)):
                if thistypes[i][0] == "a":
                    thistypes[i] = "string"
                    strings[thisfields[i]] = True
                else:
                    strings[thisfields[i]] = False

            if fields is None:
                fields = thisfields
                types = thistypes
            else:
                if fields != thisfields:
                    raise IncompatibleFilesInChain("NAB file \"%s\" header has fields %s, which differ from the first %s" % (fileName, str(thisfields), str(fields)))
                if types != thistypes:
                    raise IncompatibleFilesInChain("NAB file \"%s\" header has types %s, which differ from the first %s" % (fileName, str(thistypes), str(types)))

        table = UniTable(fields, dict(zip(fields, types)))
        table.pages = []
        table.starts = []
        table.length = 0

        for fileName in fileNames:
            file = open(fileName, "rb")
            file.readline()
            data = numpy.rec.fromfile(file, **args[fileName])
            
            table.pageSize = len(data)
            page = UniPage(table.fields, table.types)

            arrays = {}
            for f in table.fields:
                arr = data.field(f)
                if strings[f]:
                    arr = [i.decode("utf-8") for i in arr]
                arrays[f] = arr

            page.initExisting(table.pageSize, arrays, copy=False, stringToCategory=True)
            table.pages.append(page)
            table.starts.append(table.length)
            table.length += len(data)

        return table

    ################################################################ XTBL
    elif format == "XTBL":
        fileNames = getfiles(fileLocation, sorter)
        if len(fileNames) == 0:
            raise IOError("No files match \"%s\" (even with wildcards)" % fileLocation)

        limitGB = parameters.get("limitGB", None)
        memoryMap = parameters.get("memoryMap", False)

        # get the footers from each file (XML) and make sure they have identical DataDictionaries
        footers = []
        for i, fileName in enumerate(fileNames):
            fileSize = os.stat(fileName).st_size
            file = open(fileName, "rb")

            file.seek(max(0, fileSize - 1024))
            text = file.read()
            m = re.search(r"<SeekFooter\s+byteOffset=\"([0-9]+)\"\s+/>", text)
            if m is not None:
                textStart = int(m.group(1))
            else:
                raise IOError("File \"%s\" does not have the right format (the <SeekFooter /> element was not found in the last kilobyte)" % fileName)

            file.seek(textStart)

            footer = load(file.read(), xtbl.XTBL)
            footers.append(footer)
            if len(footers) > 1:
                thisDataDictionary = footer.child(xtbl.DataDictionary)
                firstDataDictionary = footers[0].child(xtbl.DataDictionary)

                if thisDataDictionary != firstDataDictionary:
                    for x in thisDataDictionary.matches(xtbl.LookupTable, maxdepth=None) + firstDataDictionary.matches(xtbl.LookupTable, maxdepth=None):
                        x.serialize()
                    raise IncompatibleFilesInChain("XTBL file \"%s\" is incompatible with the first file \"%s\":%s%s%s%s" % (fileNames[i], fileNames[0], os.linesep, thisDataDictionary.xml(), os.linesep, firstDataDictionary.xml()))

            file.close()

        # set up the UniTable's fields, types, pages, starts, and length
        fields = []
        types = {}
        dtypes = {}
        lookups = {}

        for dataField in footers[0].child(xtbl.DataDictionary).matches(xtbl.DataField):
            field = dataField.attrib["name"]
            fields.append(field)
            types[field] = dataField.attrib["type"]
            dtypes[field] = dataField.attrib["dtype"]

            lookup = dataField.child(xtbl.LookupTable, exception=False)
            if lookup is not None:
                lookups[field] = lookup.n_to_v
            else:
                lookups[field] = None

        categories = []
        for f in fields:
            n_to_v = lookups[f]
            if n_to_v is None:
                categories.append(None)
            else:
                v_to_n = dict((v, n) for n, v in n_to_v.items())
                categories.append((v_to_n, n_to_v))

        table = UniTable(fields, types)
        table.pages = []
        table.starts = []
        table.length = 0

        uniPageDiskCacheManager = UniPageDiskCacheManager(limitGB, memoryMap)

        for i, fileName in enumerate(fileNames):
            for xtblpage in footers[i].child(xtbl.Pages).matches(xtbl.Page):
                length = xtblpage.attrib["length"]

                byteOffsets = {}
                for pageFieldOffset in xtblpage.matches(xtbl.PageFieldOffset):
                    byteOffsets[pageFieldOffset.attrib["name"]] = pageFieldOffset.attrib["byteOffset"]

                uniPage = UniPageOnDisk(fields, table.types)
                uniPage.initDisk(length, fileName, byteOffsets, dtypes, categories, uniPageDiskCacheManager)

                table.pages.append(uniPage)
                table.starts.append(table.length)
                table.length += length

        return table
Example #10
def readUniTable(fileLocation,
                 format=None,
                 sorter=None,
                 pageSize=None,
                 mapInvalid=None,
                 mapMissing=None,
                 **parameters):
    format = getformat(fileLocation, format)

    ################################################################ CSV
    if format == "CSV":
        csvInput = CSVStream(fileLocation, sorter, **parameters)

        if csvInput.types is not None:
            types = csvInput.types
        else:
            types = dict((f, "string") for f in csvInput.fields)

        _mapInvalid = dict(
            (f, str("INVALID") if types[f] in ("category",
                                               "string") else -1000)
            for f in csvInput.fields)
        if mapInvalid is None:
            mapInvalid = _mapInvalid
        else:
            _mapInvalid.update(mapInvalid)
            mapInvalid = _mapInvalid

        _mapMissing = dict(
            (f, str("MISSING") if types[f] in ("category",
                                               "string") else -1000)
            for f in csvInput.fields)
        if mapMissing is None:
            mapMissing = _mapMissing
        else:
            _mapMissing.update(mapMissing)
            mapMissing = _mapMissing

        table = UniTable(csvInput.fields, types)
        table.initMemory(pageSize)

        for record in csvInput:
            table.fill([
                mapInvalid[f]
                if r is INVALID else mapMissing[f] if r is MISSING else r
                for f, r in zip(csvInput.fields, record)
            ])

        return table

    ################################################################ XML
    if format == "XML":
        xmlInput = XMLStream(fileLocation, sorter, **parameters)

        if xmlInput.types is not None:
            types = xmlInput.types
        else:
            types = dict((f, "string") for f in xmlInput.fields)

        _mapInvalid = dict(
            (f, str("INVALID") if types[f] in ("category",
                                               "string") else -1000)
            for f in xmlInput.fields)
        if mapInvalid is None:
            mapInvalid = _mapInvalid
        else:
            _mapInvalid.update(mapInvalid)
            mapInvalid = _mapInvalid

        _mapMissing = dict(
            (f, str("MISSING") if types[f] in ("category",
                                               "string") else -1000)
            for f in xmlInput.fields)
        if mapMissing is None:
            mapMissing = _mapMissing
        else:
            _mapMissing.update(mapMissing)
            mapMissing = _mapMissing

        table = UniTable(xmlInput.fields, types)
        table.initMemory(pageSize)

        for record in xmlInput:
            table.fill([
                mapInvalid[f] if r is INVALID else r
                for f, r in [(f, record.get(f, mapMissing[f]))
                             for f in xmlInput.fields]
            ])

        return table

    ################################################################ NAB
    elif format == "NAB":
        fileNames = getfiles(fileLocation, sorter)
        if len(fileNames) == 0:
            raise IOError("No files match \"%s\" (even with wildcards)" %
                          fileLocation)

        fields = None
        types = None
        strings = {}
        args = {}
        for fileName in fileNames:
            file = open(fileName, "rb")
            header = file.readline().rstrip()
            file.close()

            headerfields = header.decode("utf-8").split()
            if headerfields[0] != "RecArray":
                raise BadlyFormattedInputData(
                    "NAB file \"%s\" does not begin with 'RecArray'" %
                    fileName)

            args[fileName] = dict(
                asciistr(f).split("=") for f in headerfields[1:])

            if "masktype" in args.keys():
                raise NotImplementedError(
                    "No support yet for NAB files (such as \"%s\") with masked NumPy arrays"
                    % fileName)

            if set(args[fileName].keys()) != set(["formats", "names"]):
                raise BadlyFormattedInputData(
                    "NAB file \"%s\" headers are %s, rather than set([\"formats\", \"names\"])"
                    % (fileName, str(set(args[fileName].keys()))))

            thisfields = args[fileName]["names"].split(",")
            thistypes = args[fileName]["formats"].split(",")
            for i in xrange(len(thistypes)):
                if thistypes[i][0] == "a":
                    thistypes[i] = "string"
                    strings[thisfields[i]] = True
                else:
                    strings[thisfields[i]] = False

            if fields is None:
                fields = thisfields
                types = thistypes
            else:
                if fields != thisfields:
                    raise IncompatibleFilesInChain(
                        "NAB file \"%s\" header has fields %s, which differ from the first %s"
                        % (fileName, str(thisfields), str(fields)))
                if types != thistypes:
                    raise IncompatibleFilesInChain(
                        "NAB file \"%s\" header has types %s, which differ from the first %s"
                        % (fileName, str(thistypes), str(types)))

        table = UniTable(fields, dict(zip(fields, types)))
        table.pages = []
        table.starts = []
        table.length = 0

        for fileName in fileNames:
            file = open(fileName, "rb")
            file.readline()
            data = numpy.rec.fromfile(file, **args[fileName])

            table.pageSize = len(data)
            page = UniPage(table.fields, table.types)

            arrays = {}
            for f in table.fields:
                arr = data.field(f)
                if strings[f]:
                    arr = [i.decode("utf-8") for i in arr]
                arrays[f] = arr

            page.initExisting(table.pageSize,
                              arrays,
                              copy=False,
                              stringToCategory=True)
            table.pages.append(page)
            table.starts.append(table.length)
            table.length += len(data)

        return table

    ################################################################ XTBL
    elif format == "XTBL":
        fileNames = getfiles(fileLocation, sorter)
        if len(fileNames) == 0:
            raise IOError("No files match \"%s\" (even with wildcards)" %
                          fileLocation)

        limitGB = parameters.get("limitGB", None)
        memoryMap = parameters.get("memoryMap", False)

        # get the footers from each file (XML) and make sure they have identical DataDictionaries
        footers = []
        for i, fileName in enumerate(fileNames):
            fileSize = os.stat(fileName).st_size
            file = open(fileName, "rb")

            file.seek(max(0, fileSize - 1024))
            text = file.read()
            m = re.search(r"<SeekFooter\s+byteOffset=\"([0-9]+)\"\s+/>", text)
            if m is not None:
                textStart = int(m.group(1))
            else:
                raise IOError(
                    "File \"%s\" does not have the right format (the <SeekFooter /> element was not found in the last kilobyte)"
                    % fileName)

            file.seek(textStart)

            footer = load(file.read(), xtbl.XTBL)
            footers.append(footer)
            if len(footers) > 1:
                thisDataDictionary = footer.child(xtbl.DataDictionary)
                firstDataDictionary = footers[0].child(xtbl.DataDictionary)

                if thisDataDictionary != firstDataDictionary:
                    for x in thisDataDictionary.matches(
                            xtbl.LookupTable,
                            maxdepth=None) + firstDataDictionary.matches(
                                xtbl.LookupTable, maxdepth=None):
                        x.serialize()
                    raise IncompatibleFilesInChain(
                        "XTBL file \"%s\" is incompatible with the first file \"%s\":%s%s%s%s"
                        % (fileNames[i], fileNames[0], os.linesep,
                           thisDataDictionary.xml(), os.linesep,
                           firstDataDictionary.xml()))

            file.close()

        # set up the UniTable's fields, types, pages, starts, and length
        fields = []
        types = {}
        dtypes = {}
        lookups = {}

        for dataField in footers[0].child(xtbl.DataDictionary).matches(
                xtbl.DataField):
            field = dataField.attrib["name"]
            fields.append(field)
            types[field] = dataField.attrib["type"]
            dtypes[field] = dataField.attrib["dtype"]

            lookup = dataField.child(xtbl.LookupTable, exception=False)
            if lookup is not None:
                lookups[field] = lookup.n_to_v
            else:
                lookups[field] = None

        categories = []
        for f in fields:
            n_to_v = lookups[f]
            if n_to_v is None:
                categories.append(None)
            else:
                v_to_n = dict((v, n) for n, v in n_to_v.items())
                categories.append((v_to_n, n_to_v))

        table = UniTable(fields, types)
        table.pages = []
        table.starts = []
        table.length = 0

        uniPageDiskCacheManager = UniPageDiskCacheManager(limitGB, memoryMap)

        for i, fileName in enumerate(fileNames):
            for xtblpage in footers[i].child(xtbl.Pages).matches(xtbl.Page):
                length = xtblpage.attrib["length"]

                byteOffsets = {}
                for pageFieldOffset in xtblpage.matches(xtbl.PageFieldOffset):
                    byteOffsets[pageFieldOffset.attrib[
                        "name"]] = pageFieldOffset.attrib["byteOffset"]

                uniPage = UniPageOnDisk(fields, table.types)
                uniPage.initDisk(length, fileName, byteOffsets, dtypes,
                                 categories, uniPageDiskCacheManager)

                table.pages.append(uniPage)
                table.starts.append(table.length)
                table.length += length

        return table
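
Finally, a hedged usage sketch of readUniTable; the file name is hypothetical. A user-supplied mapMissing is merged over the defaults, so fields not named in it keep the "MISSING"/-1000 fallbacks:

    # Hypothetical CSV source; with format=None the format would be inferred by getformat.
    table = readUniTable("events.csv", format="CSV", pageSize=4096,
                         mapMissing={"age": -1})
    print(table.length, table.fields)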