def copy(src):
    """Recursively convert *src* into plain, deterministically ordered data.

    dicts become key-sorted OrderedDicts, list/tuple/set become lists
    (sets via _sorted_set for stable order), date/time values become lists
    of ints, timedeltas become float seconds, PY2 bytes become unicode.
    Anything else must already be a plain scalar (or None) and is returned
    as-is.
    """
    if isinstance(src, dict):
        return OrderedDict((key, copy(src[key])) for key in sorted(src))
    if isinstance(src, (list, tuple,)):
        return [copy(item) for item in src]
    if isinstance(src, set):
        return [copy(item) for item in _sorted_set(src)]
    # datetime must be checked before date, as datetime is a date subclass.
    if isinstance(src, datetime):
        return [
            src.year, src.month, src.day,
            src.hour, src.minute, src.second, src.microsecond,
        ]
    if isinstance(src, date):
        return [src.year, src.month, src.day]
    if isinstance(src, time):
        # Bare times get the epoch date so they serialize like datetimes.
        return [1970, 1, 1, src.hour, src.minute, src.second, src.microsecond]
    if isinstance(src, timedelta):
        return src.total_seconds()
    if PY2 and isinstance(src, bytes):
        return uni(src)
    assert isinstance(src, (str, unicode, int, float, long, bool)) or src is None, type(src)
    return src
def namefix(d, name):
    """Sanitize *name* to [0-9A-Za-z._-] and make it unique within *d*.

    Disallowed characters become underscores; 'default' is reserved while
    chaining is enabled; underscores are appended until the result is not
    already a key of *d*.
    """
    allowed = frozenset('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz._-')
    cleaned = ''.join(ch if ch in allowed else '_' for ch in uni(name))
    if cleaned == 'default' and options.chaining != 'off':
        cleaned = 'default_'
    while cleaned in d:
        cleaned += '_'
    return cleaned
def typefix(e):
    """Normalise container types recursively.

    dicts become dict_type (keys and values fixed too), list/tuple/set
    become lists, PY2 bytes become unicode; everything else passes through.
    """
    if isinstance(e, dict):
        pairs = ((typefix(key), typefix(value)) for key, value in iteritems(e))
        return dict_type(pairs)
    if isinstance(e, (list, tuple, set,)):
        return [typefix(item) for item in e]
    if PY2 and isinstance(e, bytes):
        return uni(e)
    return e
def _options(self, optionsdict, title='Options'):
    """Print *optionsdict* under *title*, keys left-justified to a common
    width; list/tuple values are printed one element per line."""
    if not optionsdict:
        return
    self.println(title)
    width = max(len(key) for key in optionsdict)
    for key, value in sorted(optionsdict.items()):
        padded = uni(key).ljust(width)
        if isinstance(value, (list, tuple)):
            self.println(' %s :' % (padded,))
            for element in value:
                self.println(' %s %s' % (' ' * width, uni(element),))
        else:
            self.println(" %s : %s " % (padded, uni(value),))
def printvec(self, vec, columns):
    """Write *vec* in *columns* columns, each entry as ' <index> <value>'.

    A newline is emitted after every full row, and a final newline after a
    trailing partial row. An empty *vec* writes nothing.
    """
    spacing = 80 // columns - 6
    # Bug fix: with an empty vec the loop never binds ix, so the check
    # below raised NameError. Seeding ix = -1 fixes that, and since
    # -1 % columns == columns - 1 it also emits no stray newline.
    ix = -1
    for ix, x in enumerate(vec):
        self.write(' %3d %s' % (ix, uni(x).ljust(spacing),))
        if ix % columns == columns - 1:
            self.write('\n')
    if ix % columns != columns - 1:
        self.write('\n')
def char2int(name, empty_value, specials="empty"):
    """Fetch option *name* and convert it to the backend int for a single
    iso-8859-1 character.

    Returns *empty_value* when the option is unset or falsy.
    Raises Exception when the value is not exactly one iso-8859-1 character.
    """
    char = options.get(name)
    if not char:
        return empty_value
    msg = "%s must be a single iso-8859-1 character (or %s)" % (name, specials,)
    if isinstance(char, bytes):
        char = uni(char)
    try:
        char = char.encode("iso-8859-1")
    except UnicodeEncodeError:
        raise Exception(msg)
    # Was an assert, but input validation must survive `python -O`, and an
    # explicit raise matches the exception used for the encode failure above
    # (AssertionError is an Exception subclass, so callers are unaffected).
    if len(char) != 1:
        raise Exception(msg)
    return cstuff.backend.char2int(char)
def __exit__(self, type, value, tb):
    # We don't care if an exception occured, we still want to save
    # the report.
    # But if saving the report produces an exception we want to
    # ignore that and re-raise the original exception (or raise
    # our own exception if no original exception exists).
    try:
        if tb is None:
            # Only finish the report with a separator line on a clean exit.
            self.line()
        # Save the report regardless of how the with-body ended.
        with open('report.txt', 'w', encoding='utf-8') as F:
            F.write(uni(self.s))
        if self.stdout:
            print(self.s)
    except Exception:
        # This logic looks backwards, but it isn't: when an original
        # exception exists (tb is not None) we swallow the save error so
        # the original propagates; on a clean exit the save error is
        # re-raised instead.
        if tb is None:
            raise
    finally:
        # Always mark closed so further write()s are rejected.
        self._closed = True
def synthesis(job):
    """Exercise csvimport edge cases: line endings, quoting, comments,
    skipped/bad lines, label handling, extra empty fields, strange values,
    very long lines, files that must fail, and finally every "idiotic"
    combination of NUL/LF/CR/0xFF as separator/newline/comment.
    """
    check_good_file(job, "mixed line endings",
        b"ix,0,1\r\n1,a,a\n2,b,b\r\n3,c,c",
        {1: b"a", 2: b"b", 3: b"c"})
    check_good_file(job, "ignored quotes",
        b"ix,0,1\n1,'a,'a\n2,'b','b'\n3,\"c\",\"c\"\n4,d',d'\n",
        {1: b"'a", 2: b"'b'", 3: b'"c"', 4: b"d'"})
    check_good_file(job, "ignored quotes and extra fields",
        b"ix,0,1\n1,\"a,\"a\n2,'b,c',d\n3,d\",d\"\n",
        {1: b'"a', 3: b'd"'},
        allow_bad=True, d_bad={3: b"2,'b,c',d"})
    check_good_file(
        job, "spaces and quotes",
        b"ix,0,1\none,a,a\ntwo, b, b\n three,c,c\n4,\"d\"\"\",d\"\n5, 'e',\" 'e'\"\n",
        {b"one": b"a", b"two": b" b", b" three": b"c", 4: b'd"', 5: b" 'e'"},
        quotes=True)
    check_good_file(job, "empty fields",
        b"ix,0,1\n1,,''\n2,,\n3,'',\n4,\"\",",
        {1: b"", 2: b"", 3: b"", 4: b""},
        quotes=True)
    check_good_file(job, "renamed fields",
        b"0,1,2\n0,foo,foo",
        {0: b"foo"},
        rename={"0": "ix", "2": "0"})
    check_good_file(job, "discarded field",
        b"ix,0,no,1\n0,yes,no,yes\n1,a,'foo,bar',a",
        {0: b"yes", 1: b"a"},
        quotes=True, discard={"no"})
    check_good_file(
        job, "bad quotes",
        b"""ix,0,1\n1,a,a\n2,"b,"b\n\n3,'c'c','c'c'\n4,"d",'d'\n""",
        {1: b"a", 4: b"d"},
        quotes=True, allow_bad=True,
        d_bad={3: b'2,"b,"b', 4: b"", 5: b"3,'c'c','c'c'"})
    check_good_file(job, "comments",
        b"""# blah\nix,0,1\n1,a,a\n2,b,b\n#3,c,c\n4,#d,#d\n""",
        {1: b"a", 2: b"b", 4: b"#d"},
        comment="#", d_skipped={1: b"# blah", 5: b"#3,c,c"})
    check_good_file(job, "not comments",
        b"""ix,0,1\n1,a,a\n2,b,b\n#3,c,c\n4,#d,#d\n""",
        {1: b"a", 2: b"b", b"#3": b"c", 4: b"#d"})
    check_good_file(
        job, "a little of everything",
        b""";not,1,labels\na,2,1\n;a,3,;a\n";b",4,;b\n'c,5,c'\r\n d,6,' d'\ne,7,e,\n,8,""",
        {4: b";b", 6: b" d", 8: b""},
        allow_bad=True, rename={"a": "0", "2": "ix"},
        quotes=True, comment=";",
        d_bad={5: b"'c,5,c'", 7: b"e,7,e,"},
        d_skipped={1: b";not,1,labels", 3: b";a,3,;a"})
    check_good_file(job, "skipped lines",
        b"""just some text\n\nix,0,1\n1,a,a\n2,b,b""",
        {1: b"a", 2: b"b"},
        skip_lines=2, d_skipped={1: b"just some text", 2: b""})
    check_good_file(job, "skipped and bad lines",
        b"""not data here\nnor here\nix,0,1\n1,a,a\n2,b\n3,c,c""",
        {1: b"a", 3: b"c"},
        skip_lines=2, allow_bad=True,
        d_bad={5: b"2,b"},
        d_skipped={1: b"not data here", 2: b"nor here"})
    check_good_file(job, "override labels",
        b"""a,b,c\n0,foo,foo""",
        {0: b"foo"},
        labels=["ix", "0", "1"])
    check_good_file(job, "only labels", b"""ix,0,1""", {})
    check_good_file(job, "empty file", b"", {}, labels=["ix", "0", "1"])
    check_good_file(job, "lineno with bad lines",
        b"ix,0,1\n2,a,a\n3,b\nc\n5,d,d\n6,e,e\n7\n8,g,g\n\n",
        {2: b"a", 5: b"d", 6: b"e", 8: b"g"},
        d_bad={3: b"3,b", 4: b"c", 7: b"7", 9: b""},
        allow_bad=True, lineno_label="num")
    check_good_file(job, "lineno with skipped lines",
        b"a\nb\n3,c,c\n4,d,d",
        {3: b"c", 4: b"d"},
        lineno_label="l", labels=["ix", "0", "1"],
        labelsonfirstline=False, skip_lines=2,
        d_skipped={1: b"a", 2: b"b"})
    check_good_file(job, "lineno with comment lines",
        b"ix,0,1\n2,a,a\n3,b,b\n#4,c,c\n5,d,d",
        {2: b"a", 3: b"b", 5: b"d"},
        lineno_label="another name", comment="#",
        d_skipped={4: b"#4,c,c"})
    check_good_file(job, "strip labels",
        b" ix , 0 , 1 \n1,a,a\n2,b ,b ",
        {1: b"a", 2: b"b "},
        strip_labels=True)
    check_good_file(job, "allow extra empty",
        b"ix,0,1,,,,\n1,a,a\n2,b,b,,\n3,,,",
        {1: b"a", 2: b"b", 3: b""},
        allow_extra_empty=True)
    check_good_file(job, "allow extra empty quoted",
        b"ix,_0_,1,,,__,\n1,a,a\n_2_,b,b,__,\n3,c,c,__",
        {1: b"a", 2: b"b", 3: b"c"},
        allow_extra_empty=True, quotes='_')
    check_good_file(
        job, "allow extra empty quoted bad",
        b"ix,0,1,,,'',\"\"\n1,a,a\n'2',b,b,'',\n3,c,c,\"\"\n4,d,d,'\"\n5,'',\"\",'",
        {1: b"a", 2: b"b", 3: b"c"},
        allow_extra_empty=True, quotes=True, allow_bad=True,
        d_bad={5: b"4,d,d,'\"", 6: b"5,'',\"\",'"})
    check_good_file(job, "skip empty lines",
        b"\nix,0,1\n\n\n1,a,a\n",
        {1: b"a"},
        skip_empty_lines=True)
    check_good_file(job, "skip empty lines and comments",
        b"\r\nix,0,1\n\n\n5,a,a\n#6,b,b\n7,c,c\n#",
        {5: b"a", 7: b"c"},
        skip_empty_lines=True, comment="#",
        d_skipped={1: b"", 3: b"", 4: b"", 6: b"#6,b,b", 8: b"#"},
        lineno_label="line")
    check_good_file(job, "skip empty lines and bad",
        b"\n\nix,0,1\n4,a,a\n \n6,b,b\n\r\n",
        {4: b"a", 6: b"b"},
        skip_empty_lines=True, comment="#",
        d_skipped={1: b"", 2: b"", 7: b""},
        d_bad={5: b" "},
        allow_bad=True, lineno_label="line")
    # Lines that must be rejected when quote parsing is on.
    bad_lines = [
        b"bad,bad",
        b",",
        b"bad,",
        b",bad",
        b"',',",
        b"'lo there broken line",
        b"'nope\"",
        b"'bad quotes''",
        b'"bad quote " inside"',
        b'"more ""bad"" quotes """ inside"',
    ]
    # Either a plain value (expected back unchanged) or
    # (quoted form on disk, expected parsed value).
    good_lines = [
        b"\x00",
        (b"'good, good'", b"good, good"),
        (b'"also good, yeah!"', b"also good, yeah!"),
        (b"'single quote''s inside'", b"single quote's inside"),
        (b"'single quote at end: '''", b"single quote at end: '"),
        (b'"""double quotes around"""', b'"double quotes around"'),
        (b'"double quote at end: """', b'double quote at end: "'),
        (b'" I\'m special "', b" I'm special "),
        b"I'm not",
        b" unquoted but with spaces around ",
        (b"','", b","),
        b"\x00\xff",
        b"\xff\x00\x08\x00",
        (b"'lot''s of ''quotes'' around here: '''''''' '", b"lot's of 'quotes' around here: '''' ")
    ]
    check_array(job, good_lines, "strange values.txt", bad_lines, quotes=True)
    # The lines will be 2 * length + 3 bytes (plus lf)
    long_lines = [
        b"a" * length
        for length in (64 * 1024 - 2, 999, 999, 1999, 3000, 65000, 8 * 1024 * 1024 - 99)
    ]
    check_array(job, long_lines, "long lines.txt")
    check_bad_file(job, "extra field", b"foo,bar\nwith,extra,field\nok,here\n")
    check_bad_file(job, "missing field", b"foo,bar\nmissing\nok,here\n")
    check_bad_file(job, "no valid lines", b"foo\nc,\n")
    # let's also check some really idiotic combinations
    for combo in permutations([0, 10, 13, 255], 3):
        name = "idiotic.%d.%d.%d" % combo
        sep, newline, comment = (uni(chr(x)) for x in combo)
        data = [
            comment,
            sep.join(["ix", "0", "1"]),
            sep.join(["0", "a", "a"]),
            sep.join([comment + "1", "b", "b"]),
            sep.join(["2", "", ""]),
            comment + sep,
            sep.join(["", "", ""]),
            sep.join(["4", ",", ","]),
            comment,
        ]
        check_good_file(
            job,
            name,
            data=newline.join(data).encode("iso-8859-1"),
            d={0: b"a", 2: b"", b"": b"", 4: b","},
            d_skipped={k: data[k - 1].encode("iso-8859-1") for k in (1, 4, 6, 9)},
            separator=sep,
            newline=newline,
            comment=comment,
        )
    check_no_separator(job)
def check_one(job, newline, sep, data, want_res=None, prefix="", quotes=False, leave_bad=False):
    """Write *data* (list of rows, row 0 = labels) as a csv file with the
    given separator byte and newline, run csvimport on it, and verify that
    the resulting dataset has the expected labels and rows.

    Raises CSVImportException / WrongLabelsException / WrongDataException
    on failure.
    """
    sep_ch = uni(chr(sep))
    if not quotes and not leave_bad:
        # Unquoted values can't contain the separator character - strip it.
        data = [[field.replace(sep_ch, "") for field in row] for row in data]
    if not want_res:
        want_res = [tuple(field.encode("ascii") for field in row) for row in data[1:]]
    filename = "%s_csv.%d.%s.txt" % (prefix, sep, "CRLF" if newline == "\r\n" else ord(newline))
    newline = uni(newline)
    with job.open(filename, "w", encoding="iso-8859-1", temp=True) as out:
        for row in data:
            if quotes:
                # Quote every field, doubling any embedded quote characters.
                row = [quotes + field.replace(quotes, quotes + quotes) + quotes for field in row]
            out.write(sep_ch.join(row))
            out.write(newline)
    import_options = dict(
        filename=job.filename(filename),
        separator=sep_ch,
        quotes=quotes,
        newline='' if "\n" in newline else newline,
    )
    try:
        jid = subjobs.build("csvimport", options=import_options)
    except JobError as err:
        raise CSVImportException("Failed to csvimport for separator %d with newline %r, csvimport error was:\n%s" % (sep, newline, err.format_msg()))
    ds = Dataset(jid)
    got_labels = sorted(ds.columns)
    if got_labels != data[0]:
        raise WrongLabelsException("csvimport gave wrong labels for separator %d with newline %r: %r (expected %r)" % (sep, newline, got_labels, data[0],))
    got_rows = list(ds.iterate(None, data[0]))
    if got_rows != want_res:
        raise WrongDataException("csvimport gave wrong data for separator %d with newline %r: %r (expected %r)" % (sep, newline, got_rows, want_res,))
def write(self, s):
    """Append *s* (coerced to unicode) to the report buffer.

    Refuses to write once the report has been closed.
    """
    assert not self._closed, 'Closed.'
    self.s = self.s + uni(s)