Example #1
    def link_to_here(self, name='default', column_filter=None):
        """Use this to expose a subjob as a dataset in your job:
		Dataset(subjid).link_to_here()
		will allow access to the subjob dataset under your jid.
		Specify column_filter as an iterable of columns to include
		if you don't want all of them."""
        if column_filter:
            column_filter = set(column_filter)
            filtered_columns = {
                k: v
                for k, v in self._data.columns.items() if k in column_filter
            }
            left_over = column_filter - set(filtered_columns)
            assert not left_over, "Columns in filter not available in dataset: %r" % (
                left_over, )
            assert filtered_columns, "Filter produced no desired columns."
            self._data.columns = filtered_columns
        from g import JOBID
        self._data.parent = '%s/%s' % (
            self.jobid,
            self.name,
        )
        self.jobid = uni(JOBID)
        self.name = uni(name)
        self._save()
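
A minimal usage sketch of link_to_here(), following the docstring above; the built method name and the column names are hypothetical, and subjobs.build is used as in Example #17:

    subjid = subjobs.build("some_import_method")   # hypothetical subjob
    Dataset(subjid).link_to_here(name='imported')  # expose all its columns under our jid
    Dataset(subjid).link_to_here(name='small', column_filter=('ix', 'value'))  # or only a subset
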
Example #2
	def append(self, columns, filenames, lines, minmax={}, filename=None, hashlabel=None, hashlabel_override=False, caption=None, previous=None, name='default'):
		if hashlabel:
			hashlabel = uni(hashlabel)
			if not hashlabel_override:
				assert self.hashlabel == hashlabel, 'Hashlabel mismatch %s != %s' % (self.hashlabel, hashlabel,)
		assert self._linefixup(lines) == self.lines, "New columns don't have the same number of lines as parent columns"
		columns = {uni(k): uni(v) for k, v in columns.items()}
		self._append(columns, filenames, minmax, filename, caption, previous, name)
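
A hedged sketch of an append() call; the dataset id, column name, and type are made up for illustration:

    ds = Dataset('some-jobid/default')       # hypothetical existing dataset
    ds.append(
        columns={'extra': 'number'},         # {"colname": "type"}, as in Dataset.new()
        filenames={'extra': 'extra'},        # column name -> file name, same keys as columns
        lines=ds.lines,                      # must match the parent's per-slice line counts
    )
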
Example #3
 def __new__(cls, jobid, name=None):
     if isinstance(jobid, (tuple, list)):
         jobid = _dsid(jobid)
     elif isinstance(jobid, dict):
         assert not name, "Don't pass both a separate name and jobid as {job: dataset}"
         assert len(jobid) == 1, "Only pass a single {job: dataset}"
         jobid, dsname = next(iteritems(jobid))
         if not jobid:
             return None
         jobid = job_params(jobid, default_empty=True).datasets.get(dsname)
         if not jobid:
             return None
     if '/' in jobid:
         assert not name, "Don't pass both a separate name and jobid as jid/name"
         jobid, name = jobid.split('/', 1)
     assert jobid, "If you really meant to use yourself as a dataset, pass params.jobid explicitly."
     name = uni(name or 'default')
     assert '/' not in name
     if name == 'default':
         suffix = ''
     else:
         suffix = '/' + name
     if jobid is _new_dataset_marker:
         from g import JOBID
         fullname = JOBID + suffix
     else:
         fullname = jobid + suffix
     obj = unicode.__new__(cls, fullname)
     obj.name = uni(name or 'default')
     if jobid is _new_dataset_marker:
         obj._data = DotDict({
             'version': (
                 2,
                 2,
             ),
             'filename': None,
             'hashlabel': None,
             'caption': '',
             'columns': {},
             'parent': None,
             'previous': None,
             'lines': [],
         })
         obj.jobid = None
     else:
         obj.jobid = jobid
         obj._data = DotDict(_ds_load(obj))
         assert obj._data.version[0] == 2 and obj._data.version[1] >= 2, \
             "%s/%s: Unsupported dataset pickle version %r" % (jobid, name, obj._data.version)
         obj._data.columns = dict(obj._data.columns)
     return obj
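
The constructor above accepts the same dataset spelled several ways; an illustration with a made-up jobid and parameter name:

    Dataset('test-123')                # the 'default' dataset of job test-123
    Dataset('test-123/mydata')         # 'jid/name' form
    Dataset('test-123', 'mydata')      # jobid plus a separate name
    Dataset(('test-123', 'mydata'))    # tuple/list form, normalized through _dsid()
    Dataset({'test-123': 'source'})    # the dataset job test-123 received as its 'source' datasets parameter
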
Example #4
def _dsid(t):
    if not t:
        return None
    if isinstance(t, (tuple, list)):
        jid, name = t
        if not jid:
            return None
        t = '%s/%s' % (jid.split('/')[0], uni(name) or 'default')
    if '/' not in t:
        t += '/default'
    return uni(t)
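
What _dsid() returns for a few inputs (assuming, as the callers suggest, that uni(None) is None):

    _dsid('test-123')                # -> 'test-123/default'
    _dsid('test-123/mydata')         # -> 'test-123/mydata'
    _dsid(('test-123', 'mydata'))    # -> 'test-123/mydata'
    _dsid(('test-123', None))        # -> 'test-123/default'
    _dsid(None)                      # -> None
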
Example #5
	def new(columns, filenames, lines, minmax={}, filename=None, hashlabel=None, caption=None, previous=None, name='default'):
		"""columns = {"colname": "type"}, lines = [n, ...] or {sliceno: n}"""
		columns = {uni(k): uni(v) for k, v in columns.items()}
		if hashlabel:
			hashlabel = uni(hashlabel)
			assert hashlabel in columns, hashlabel
		res = Dataset(_new_dataset_marker, name)
		res._data.lines = list(Dataset._linefixup(lines))
		res._data.hashlabel = hashlabel
		res._append(columns, filenames, minmax, filename, caption, previous, name)
		return res
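
A minimal sketch of Dataset.new() following its docstring; the column names, types, and per-slice line counts are made up:

    ds = Dataset.new(
        columns={'ix': 'number', 'name': 'unicode'},   # {"colname": "type"}
        filenames={'ix': 'ix', 'name': 'name'},
        lines=[100, 100, 100],                         # [n, ...] per slice, or {sliceno: n}
        hashlabel='ix',                                # must be one of the columns
    )
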
Example #6
    def __new__(cls,
                columns={},
                filename=None,
                hashlabel=None,
                hashlabel_override=False,
                caption=None,
                previous=None,
                name='default',
                parent=None,
                meta_only=False,
                for_single_slice=None):
        """columns can be {'name': 'type'} or {'name': DatasetColumn}
		to simplify basing your dataset on another."""
        name = uni(name)
        assert '/' not in name, name
        from g import running
        if running == 'analysis':
            assert name in _datasetwriters, 'Dataset with name "%s" not created' % (
                name, )
            assert not columns and not filename and not hashlabel and not caption and not parent and for_single_slice is None, "Don't specify any arguments (except optionally name) in analysis"
            return _datasetwriters[name]
        else:
            assert name not in _datasetwriters, 'Duplicate dataset name "%s"' % (
                name, )
            os.mkdir(name)
            obj = object.__new__(cls)
            obj._running = running
            obj.filename = uni(filename)
            obj.hashlabel = uni(hashlabel)
            obj.hashlabel_override = hashlabel_override
            obj.caption = uni(caption)
            obj.previous = _dsid(previous)
            obj.name = uni(name)
            obj.parent = _dsid(parent)
            obj.columns = {}
            obj.meta_only = meta_only
            obj._for_single_slice = for_single_slice
            obj._clean_names = {}
            if parent:
                obj._pcolumns = Dataset(parent).columns
                obj._seen_n = set(c.name for c in obj._pcolumns.values())
            else:
                obj._pcolumns = {}
                obj._seen_n = set()
            obj._started = False
            obj._lens = {}
            obj._minmax = {}
            obj._order = []
            for k, v in sorted(columns.items()):
                if isinstance(v, tuple):
                    v = v.type
                obj.add(k, v)
            _datasetwriters[name] = obj
            return obj
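
A usage sketch mirroring the pattern in Example #17; the column names and types are made up, and params is the job's parameter object as used there:

    dw = DatasetWriter(name='out', columns={'ix': 'number', 'name': 'unicode'})
    for sliceno in range(params.slices):
        dw.set_slice(sliceno)
        dw.write_dict({'ix': sliceno, 'name': 'row in slice %d' % (sliceno,)})
    dw.finish()
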
Example #7
 def add(self, colname, coltype, default=_nodefault):
     from g import running
     assert running == self._running, "Add all columns in the same step as creation"
     assert not self._started, "Add all columns before setting slice"
     colname = uni(colname)
     coltype = uni(coltype)
     assert colname not in self.columns, colname
     assert colname
     typed_writer(coltype)  # gives error for unknown types
     self.columns[colname] = (coltype, default)
     self._order.append(colname)
     if colname in self._pcolumns:
         self._clean_names[colname] = self._pcolumns[colname].name
     else:
         self._clean_names[colname] = _clean_name(colname, self._seen_n)
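
Columns can also be added one at a time after constructing the writer, as long as it happens in the same step and before set_slice(); the names and types here are illustrative:

    dw = DatasetWriter(name='out')
    dw.add('ix', 'number')
    dw.add('name', 'unicode')
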
Example #8
	def _append(self, columns, filenames, minmax, filename, caption, previous, name):
		from sourcedata import type2iter
		from g import JOBID
		jobid = uni(JOBID)
		name = uni(name)
		filenames = {uni(k): uni(v) for k, v in filenames.items()}
		assert set(columns) == set(filenames), "columns and filenames don't have the same keys"
		if self.jobid and (self.jobid != jobid or self.name != name):
			self._data.parent = '%s/%s' % (self.jobid, self.name,)
		self.jobid = jobid
		self.name = name
		self._data.filename = uni(filename) or self._data.filename or None
		self._data.caption  = uni(caption) or self._data.caption or jobid
		self._data.previous = _dsid(previous)
		for n in ('cache', 'cache_distance'):
			if n in self._data: del self._data[n]
		minmax = self._minmax_merge(minmax)
		for n, t in sorted(columns.items()):
			if t not in type2iter:
				raise Exception('Unknown type %s on column %s' % (t, n,))
			mm = minmax.get(n, (None, None,))
			self._data.columns[n] = DatasetColumn(
				type=uni(t),
				name=filenames[n],
				location='%s/%s/%%s.%s' % (jobid, self.name, filenames[n]),
				min=mm[0],
				max=mm[1],
				offsets=None,
			)
			self._maybe_merge(n)
		self._update_caches()
		self._save()
Example #9
	def typefix(e):
		if isinstance(e, dict):
			return dict_type((typefix(k), typefix(v)) for k, v in iteritems(e))
		elif isinstance(e, (list, tuple, set,)):
			return [typefix(v) for v in e]
		elif PY2 and isinstance(e, bytes):
			return uni(e)
		else:
			return e
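
Roughly what typefix() does, on a made-up value (on Python 3 the bytes branch is skipped, and dict_type is whatever mapping type the module uses):

    typefix({b'key': (b'a', {b'b'})})
    # On Python 2: dict_type with u'key' -> [u'a', [u'b']]
    # (bytes decoded via uni(); tuples and sets become lists).
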
Example #10
 def _options(self, optionsdict, title='Options'):
     if not optionsdict:
         return
     self.println(title)
     maxlen = max(len(k) for k in optionsdict)
     for k, v in sorted(optionsdict.items()):
         k = uni(k).ljust(maxlen)
         if isinstance(v, (list, tuple)):
             self.println('  %s :' % (k, ))
             for t in v:
                 self.println('  %s   %s' % (
                     ' ' * maxlen,
                     uni(t),
                 ))
         else:
             self.println("  %s : %s " % (
                 k,
                 uni(v),
             ))
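
A sketch of the resulting output, assuming r is an instance of the report class these methods belong to:

    r._options({'shuffle': True, 'files': ['a.txt', 'b.txt']})
    # prints (roughly):
    # Options
    #   files   :
    #             a.txt
    #             b.txt
    #   shuffle : True
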
Example #11
 def printvec(self, vec, columns):
     spacing = 80 // columns - 6
     if not vec:
         # nothing to print; without this guard ix below would be unbound
         return
     for ix, x in enumerate(vec):
         self.write('  %3d %s' % (
             ix,
             uni(x).ljust(spacing),
         ))
         if ix % columns == columns - 1:
             self.write('\n')
     if ix % columns != columns - 1:
         self.write('\n')
Example #12
    def link_to_here(self,
                     name='default',
                     column_filter=None,
                     override_previous=_no_override):
        """Use this to expose a subjob as a dataset in your job:
		Dataset(subjid).link_to_here()
		will allow access to the subjob dataset under your jid.
		Specify column_filter as an iterable of columns to include
		if you don't want all of them.
		Use override_previous to rechain (or unchain) the dataset.
		"""
        d = Dataset(self)
        if column_filter:
            column_filter = set(column_filter)
            filtered_columns = {
                k: v
                for k, v in d._data.columns.items() if k in column_filter
            }
            left_over = column_filter - set(filtered_columns)
            assert not left_over, "Columns in filter not available in dataset: %r" % (
                left_over, )
            assert filtered_columns, "Filter produced no desired columns."
            d._data.columns = filtered_columns
        from g import JOBID
        if override_previous is not _no_override:
            override_previous = _dsid(override_previous)
            if override_previous:
                # make sure it's valid
                Dataset(override_previous)
            d._data.previous = override_previous
            d._update_caches()
        d._data.parent = '%s/%s' % (
            d.jobid,
            d.name,
        )
        d.jobid = uni(JOBID)
        d.name = uni(name)
        d._save()
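
A hedged sketch of the rechaining described in the docstring; subjid and older_ds are hypothetical:

    Dataset(subjid).link_to_here(name='imported', override_previous=older_ds)  # rechain onto older_ds
    Dataset(subjid).link_to_here(name='alone', override_previous=None)         # unchain entirely
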
Example #13
 def __exit__(self, type, value, tb):
     # We don't care if an exception occurred, we still want to save
     # the report.
     # But if saving the report produces an exception we want to
     # ignore that and re-raise the original exception (or raise
     # our own exception if no original exception exists).
     try:
         if tb is None:
             self.line()
         with open('report.txt', 'w', encoding='utf-8') as F:
             F.write(uni(self.s))
         if self.stdout:
             print(self.s)
     except Exception:
         # This logic looks backwards, but it isn't
         if tb is None:
             raise
     finally:
         self._closed = True
Example #14
 def close(self):
     self.line()
     with open('report.txt', 'w', encoding='utf-8') as F:
         F.write(uni(self.s))
     if self.stdout:
         print(self.s)
Example #15
 def write(self, s):
     assert not self._closed, 'Closed.'
     self.s += uni(s)
Example #16
def synthesis(params):
    check_good_file(params, "mixed line endings",
                    b"ix,0,1\r\n1,a,a\n2,b,b\r\n3,c,c", {
                        1: b"a",
                        2: b"b",
                        3: b"c"
                    })
    check_good_file(params, "ignored quotes",
                    b"ix,0,1\n1,'a,'a\n2,'b','b'\n3,\"c\",\"c\"\n4,d',d'\n", {
                        1: b"'a",
                        2: b"'b'",
                        3: b'"c"',
                        4: b"d'"
                    })
    check_good_file(params,
                    "ignored quotes and extra fields",
                    b"ix,0,1\n1,\"a,\"a\n2,'b,c',d\n3,d\",d\"\n", {
                        1: b'"a',
                        3: b'd"'
                    },
                    allow_bad=True,
                    d_bad={3: b"2,'b,c',d"})
    check_good_file(
        params,
        "spaces and quotes",
        b"ix,0,1\none,a,a\ntwo, b, b\n three,c,c\n4,\"d\"\"\",d\"\n5, 'e',\" 'e'\"\n",
        {
            b"one": b"a",
            b"two": b" b",
            b" three": b"c",
            4: b'd"',
            5: b" 'e'"
        },
        quotes=True)
    check_good_file(params,
                    "empty fields",
                    b"ix,0,1\n1,,''\n2,,\n3,'',\n4,\"\",", {
                        1: b"",
                        2: b"",
                        3: b"",
                        4: b""
                    },
                    quotes=True)
    check_good_file(params,
                    "renamed fields",
                    b"0,1,2\n0,foo,foo", {0: b"foo"},
                    rename={
                        "0": "ix",
                        "2": "0"
                    })
    check_good_file(params,
                    "discarded field",
                    b"ix,0,no,1\n0,yes,no,yes\n1,a,'foo,bar',a", {
                        0: b"yes",
                        1: b"a"
                    },
                    quotes=True,
                    discard={"no"})
    check_good_file(
        params,
        "bad quotes",
        b"""ix,0,1\n1,a,a\n2,"b,"b\n\n3,'c'c','c'c'\n4,"d",'d'\n""", {
            1: b"a",
            4: b"d"
        },
        quotes=True,
        allow_bad=True,
        d_bad={
            3: b'2,"b,"b',
            4: b"",
            5: b"3,'c'c','c'c'"
        })
    check_good_file(params,
                    "comments",
                    b"""# blah\nix,0,1\n1,a,a\n2,b,b\n#3,c,c\n4,#d,#d\n""", {
                        1: b"a",
                        2: b"b",
                        4: b"#d"
                    },
                    comment="#",
                    d_skipped={
                        1: b"# blah",
                        5: b"#3,c,c"
                    })
    check_good_file(params, "not comments",
                    b"""ix,0,1\n1,a,a\n2,b,b\n#3,c,c\n4,#d,#d\n""", {
                        1: b"a",
                        2: b"b",
                        b"#3": b"c",
                        4: b"#d"
                    })
    check_good_file(
        params,
        "a little of everything",
        b""";not,1,labels\na,2,1\n;a,3,;a\n";b",4,;b\n'c,5,c'\r\n d,6,' d'\ne,7,e,\n,8,""",
        {
            4: b";b",
            6: b" d",
            8: b""
        },
        allow_bad=True,
        rename={
            "a": "0",
            "2": "ix"
        },
        quotes=True,
        comment=";",
        d_bad={
            5: b"'c,5,c'",
            7: b"e,7,e,"
        },
        d_skipped={
            1: b";not,1,labels",
            3: b";a,3,;a"
        })
    check_good_file(params,
                    "skipped lines",
                    b"""just some text\n\nix,0,1\n1,a,a\n2,b,b""", {
                        1: b"a",
                        2: b"b"
                    },
                    skip_lines=2,
                    d_skipped={
                        1: b"just some text",
                        2: b""
                    })
    check_good_file(params,
                    "skipped and bad lines",
                    b"""not data here\nnor here\nix,0,1\n1,a,a\n2,b\n3,c,c""",
                    {
                        1: b"a",
                        3: b"c"
                    },
                    skip_lines=2,
                    allow_bad=True,
                    d_bad={5: b"2,b"},
                    d_skipped={
                        1: b"not data here",
                        2: b"nor here"
                    })
    check_good_file(params,
                    "override labels",
                    b"""a,b,c\n0,foo,foo""", {0: b"foo"},
                    labels=["ix", "0", "1"])
    check_good_file(params, "only labels", b"""ix,0,1""", {})
    check_good_file(params, "empty file", b"", {}, labels=["ix", "0", "1"])

    bad_lines = [
        b"bad,bad",
        b",",
        b"bad,",
        b",bad",
        b"',',",
        b"'lo there broken line",
        b"'nope\"",
        b"'bad quotes''",
        b'"bad quote " inside"',
        b'"more ""bad"" quotes """ inside"',
    ]
    good_lines = [
        b"\x00", (b"'good, good'", b"good, good"),
        (b'"also good, yeah!"', b"also good, yeah!"),
        (b"'single quote''s inside'", b"single quote's inside"),
        (b"'single quote at end: '''", b"single quote at end: '"),
        (b'"""double quotes around"""', b'"double quotes around"'),
        (b'"double quote at end: """', b'double quote at end: "'),
        (b'" I\'m special "', b" I'm special "), b"I'm not",
        b" unquoted but with spaces around ", (b"','", b","), b"\x00\xff",
        b"\xff\x00\x08\x00",
        (b"'lot''s of ''quotes'' around here: '''''''' '",
         b"lot's of 'quotes' around here: '''' ")
    ]
    check_array(params,
                good_lines,
                "strange values.txt",
                bad_lines,
                quotes=True)
    # The lines will be 2 * length + 3 bytes (plus lf)
    long_lines = [
        b"a" * length for length in (64 * 1024 - 2, 999, 999, 1999, 3000,
                                     65000, 8 * 1024 * 1024 - 99)
    ]
    check_array(params, long_lines, "long lines.txt")
    check_bad_file(params, "extra field",
                   b"foo,bar\nwith,extra,field\nok,here\n")
    check_bad_file(params, "missing field", b"foo,bar\nmissing\nok,here\n")
    check_bad_file(params, "no valid lines", b"foo\nc,\n")

    # let's also check some really idiotic combinations
    for combo in permutations([0, 10, 13, 255], 3):
        name = "idiotic.%d.%d.%d" % combo
        sep, newline, comment = (uni(chr(x)) for x in combo)
        data = [
            comment,
            sep.join(["ix", "0", "1"]),
            sep.join(["0", "a", "a"]),
            sep.join([comment + "1", "b", "b"]),
            sep.join(["2", "", ""]),
            comment + sep,
            sep.join(["", "", ""]),
            sep.join(["4", ",", ","]),
            comment,
        ]
        check_good_file(
            params,
            name,
            data=newline.join(data).encode("iso-8859-1"),
            d={
                0: b"a",
                2: b"",
                b"": b"",
                4: b","
            },
            d_skipped={
                k: data[k - 1].encode("iso-8859-1")
                for k in (1, 4, 6, 9)
            },
            separator=sep,
            newline=newline,
            comment=comment,
        )

    check_no_separator(params)
Example #17
def do_one(params, name, data):
	dw = DatasetWriter(name=name, columns=columns)
	dw.set_slice(0)
	for v in data:
		if v is None:
			d = dict(
				ascii_new=None,
				ascii_old=None,
				bytes_new=None,
				bytes_old=None,
				unicode_new=None,
				unicode_old=None,
			)
		else:
			d = dict(
				ascii_new=v,
				ascii_old=v,
				bytes_new=uni(v).encode("ascii"),
				bytes_old=uni(v).encode("ascii"),
				unicode_new=uni(v),
				unicode_old=uni(v),
			)
		dw.write_dict(d)
	# We don't really want the other slices, but write one thing to
	# each, to make sure it doesn't show up in slice 0.
	# (Small slice merging will put it in the same file, so this is
	# a real risk.)
	for sliceno in range(1, params.slices):
		dw.set_slice(sliceno)
		dw.write_dict(d)
	dw.finish()

	# verify we got what we asked for
	me_ds = Dataset(params.jobid, name)
	for colname, coltype in columns.items():
		col = me_ds.columns[colname]
		assert col.type == coltype.split("_")[-1], colname
		assert col.backing_type == coltype, colname
		for want, got in zip(data, me_ds.iterate(0, colname)):
			if want is not None:
				if PY2 and "unicode" in coltype:
					want = uni(want)
				if PY3 and "bytes" in coltype:
					want = want.encode("ascii")
			assert want == got, "%s in %s did not contain the expected value. Wanted %r but got %r." % (colname, me_ds, want, got)

	# check that both types of bytes filter correctly through typing
	jid = subjobs.build("dataset_type", datasets=dict(source=me_ds), options=dict(
		column2type=dict(
			ascii_new="bytes",
			ascii_old="number", # fails on the string, so that gets filtered out everywhere
			bytes_new="bytes",
			bytes_old="bytes",
		),
		filter_bad=True,
	))
	ds = Dataset(jid)
	# verify the number first
	data_it = iter(raw_data)
	next(data_it) # skip the filtered out string
	for got in ds.iterate(0, "ascii_old"):
		want = next(data_it)
		if want is None:
			# Becomes 0 because the typer (unfortunately) sees it as an empty string
			want = 0
		assert want == got, "ascii_old in %s did not type correctly as number. Wanted %r but got %r." % (ds, want, got)
	# now verify all the bytes ones are ok, no longer containing the string.
	for colname in ("ascii_new", "bytes_new", "bytes_old",):
		data_it = iter(data)
		next(data_it) # skip the filtered out string
		for got in ds.iterate(0, colname):
			want = next(data_it)
			if want is not None:
				want = want.encode("ascii")
			assert want == got, "%s in %s did not roundtrip correctly as bytes. Wanted %r but got %r." % (colname, ds, want, got)

	# and now check that the Nones are ok after making bytes from ascii and unicode from bytes.
	jid = subjobs.build("dataset_type", datasets=dict(source=me_ds), options=dict(
		column2type=dict(
			ascii_new="bytes",
			ascii_old="bytes",
			bytes_new="unicode:ascii",
			bytes_old="unicode:ascii",
		),
	))
	ds = Dataset(jid)
	for colname in ("ascii_new", "ascii_old", "bytes_new", "bytes_old",):
		for want, got in ds.iterate(0, ["unicode_new", colname]):
			assert uni(want) == uni(got), "%s in %s did not roundtrip correctly as bytes. Wanted %r but got %r." % (colname, ds, want, got)
Example #18
def _ds_load(obj):
    n = unicode(obj)
    if n not in _ds_cache:
        _ds_cache[n] = _v2_columntypefix(
            blob.load(obj._name('pickle'), obj.jobid))
        _ds_cache.update(_ds_cache[n].get('cache', ()))
    return _ds_cache[n]


_type_v2to3backing = dict(
    ascii="_v2_ascii",
    bytes="_v2_bytes",
    unicode="_v2_unicode",
    json="_v2_json",
)
_type_v2compattov3t = {v: uni(k) for k, v in _type_v2to3backing.items()}


def _dc_v2to3(dc):
    return _DatasetColumn_3_0(
        type=dc.type,
        backing_type=_type_v2to3backing.get(dc.type, dc.type),
        name=dc.name,
        location=dc.location,
        min=dc.min,
        max=dc.max,
        offsets=dc.offsets,
    )


def _v2_columntypefix(ds):
Example #19
def namefix(d, name):
    ok = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz.'
    name = ''.join(c if c in ok else '_' for c in uni(name))
    while name in d:
        name += '_'
    return name


def check_one(params,
              newline,
              sep,
              data,
              want_res=None,
              prefix="",
              quotes=False,
              leave_bad=False):
    sep_c = uni(chr(sep))
    # Can't have separator character in unquoted values
    if not quotes and not leave_bad:
        data = [[el.replace(sep_c, "") for el in line] for line in data]
    if not want_res:
        want_res = [
            tuple(s.encode("ascii") for s in line) for line in data[1:]
        ]
    filename = "%s_csv.%d.%s.txt" % (prefix, sep, "CRLF"
                                     if newline == "\r\n" else ord(newline))
    newline = uni(newline)
    with open(filename, "w", encoding="iso-8859-1") as fh:
        for line in data:
            if quotes:
                line = [
                    quotes + el.replace(quotes, quotes + quotes) + quotes
                    for el in line
                ]
            fh.write(sep_c.join(line))
            fh.write(newline)
    try:
        jid = subjobs.build("csvimport",
                            options=dict(
                                filename=resolve_jobid_filename(
                                    params.jobid, filename),
                                separator=sep_c,
                                quotes=quotes,
                                newline='' if "\n" in newline else newline,
                            ))
    except JobError as e:
        raise CSVImportException(
            "Failed to csvimport for separator %d with newline %r, csvimport error was:\n%s"
            % (sep, newline, e.format_msg()))
    ds = Dataset(jid)
    labels = sorted(ds.columns)
    if labels != data[0]:
        raise WrongLabelsException(
            "csvimport gave wrong labels for separator %d with newline %r: %r (expected %r)"
            % (
                sep,
                newline,
                labels,
                data[0],
            ))
    res = list(ds.iterate(None, data[0]))
    if res != want_res:
        raise WrongDataException(
            "csvimport gave wrong data for separator %d with newline %r: %r (expected %r)"
            % (
                sep,
                newline,
                res,
                want_res,
            ))