def test_read_bom(num, prefix=""):
    """Check how the gzutil line readers treat UTF-8 BOMs found in TMP_FN.

    TMP_FN is expected to contain two lines: ``prefix`` + BOM + "a"
    followed by BOM + "b".  ``num`` is only used to identify the calling
    scenario in the assertion messages.
    """
    def slurp(reader, *args, **kwargs):
        # Read every line of the file through the given reader class.
        with reader(*args, **kwargs) as fh:
            return list(fh)

    prefix_bytes = prefix.encode("utf-8")
    # The same prefix bytes as seen by a latin-1 decoder.
    prefix_latin1 = prefix_bytes.decode("latin-1")

    # Raw bytes: BOMs are kept unless strip_bom is set, and then only the
    # one at the very start of the file is expected to go away.
    data = slurp(gzutil.GzBytesLines, TMP_FN)
    assert data == [prefix_bytes + b"\xef\xbb\xbfa", b"\xef\xbb\xbfb"], (num, data)
    data = slurp(gzutil.GzBytesLines, TMP_FN, strip_bom=True)
    assert data == [prefix_bytes + b"a", b"\xef\xbb\xbfb"], (num, data)

    # Default unicode decoding: the BOM shows up as U+FEFF.
    data = slurp(gzutil.GzUnicodeLines, TMP_FN)
    assert data == [prefix + "\ufeffa", "\ufeffb"], (num, data)
    data = slurp(gzutil.GzUnicodeLines, TMP_FN, strip_bom=True)
    assert data == [prefix + "a", "\ufeffb"], (num, data)

    # latin-1 decoding: the three BOM bytes become three separate characters.
    data = slurp(gzutil.GzUnicodeLines, TMP_FN, "latin-1")
    assert data == [prefix_latin1 + u"\xef\xbb\xbfa", u"\xef\xbb\xbfb"], (num, data)
    data = slurp(gzutil.GzUnicodeLines, TMP_FN, "latin-1", strip_bom=True)
    assert data == [prefix_latin1 + u"a", u"\xef\xbb\xbfb"], (num, data)

    # ascii with errors="ignore": every non-ascii byte is dropped.
    data = slurp(gzutil.GzUnicodeLines, TMP_FN, "ascii", "ignore")
    assert data == ["a", "b"], (num, data)

    if version_info[0] > 2:
        # On python3 the ascii reader is expected to reject the BOM bytes.
        with gzutil.GzAsciiLines(TMP_FN) as fh:
            try:
                next(fh)
            except ValueError:
                pass
            else:
                raise Exception("GzAsciiLines allowed non-ascii in python3")
def __init__(self, *a, **kw):
    """Open a gzutil line reader that reads one line more than requested.

    All arguments are passed straight through to the reader, except that
    a ``max_count`` keyword (when given) is raised by one first.
    """
    # NOTE(review): the extra line is presumably a header/label row that
    # callers do not count -- confirm against the rest of the class.
    if 'max_count' in kw:
        kw['max_count'] += 1
    # Unicode lines on python3, byte lines otherwise.
    reader_cls = gzutil.GzUnicodeLines if PY3 else gzutil.GzBytesLines
    self.fh = reader_cls(*a, **kw)
def prepare(SOURCE_DIRECTORY):
    """Locate the input file, work out the column labels and create the writer.

    Steps, in order:
      * Join SOURCE_DIRECTORY and options.filename.  If the name ends in
        '.zip' (case-insensitively) its single member is extracted to a
        file called 'extracted' in the current directory, and that file
        is used from then on.
      * If options.labelsonfirstline is set, read the labels from the
        first line of the file (BOM stripped, non-ascii replaced by '?'),
        honouring quotes when options.quote_support is set.
      * options.labels, when non-empty, overrides labels read from the file.
      * Apply options.rename, then check that labels are non-empty and unique.
      * Create a meta_only DatasetWriter with one 'bytes' column per label.

    Returns the tuple (separator, filename, orig_filename, labels, dw).

    Raises AssertionError if the separator is not exactly one character,
    the ZIP file does not have exactly one member, or the labels are
    missing, empty or duplicated.
    """
    separator = options.separator
    assert len(separator) == 1
    filename = os.path.join(SOURCE_DIRECTORY, options.filename)
    orig_filename = filename

    if filename.lower().endswith('.zip'):
        from shutil import copyfileobj
        from zipfile import ZipFile
        filename = 'extracted'
        with ZipFile(orig_filename, 'r') as z:
            infos = z.infolist()
            assert len(infos) == 1, 'There is only support for ZIP files with exactly one member.'
            # Wouldn't it be nice if ZipFile.extract let me choose the filename?
            with open(filename, 'wb') as ofh:
                # Close the member handle too, not just the archive.
                with z.open(infos[0]) as zfh:
                    copyfileobj(zfh, ofh, 1024 * 1024)

    # Start from "no labels" so that a configuration with neither
    # labelsonfirstline nor options.labels reaches the "No labels"
    # assertion below instead of failing on an unbound name.
    labels = []
    if options.labelsonfirstline:
        with gzutil.GzBytesLines(filename, strip_bom=True) as fh:
            labels_str = next(fh).decode('ascii', 'replace').encode('ascii', 'replace') # garbage -> '?'
        # NOTE(review): labels_str is a byte string here while the quote
        # characters used for splitting are plain literals; on Python 3
        # that combination would raise TypeError -- confirm this only
        # runs on Python 2.
        if options.quote_support:
            labels = _split_quoted_labels(labels_str, separator)
        else:
            labels = labels_str.split(separator)

    labels = options.labels or labels # only from file if not specified in options
    assert labels, "No labels"
    labels = [options.rename.get(x, x) for x in labels]
    assert '' not in labels, "Empty label for column %d" % (labels.index(''),)
    assert len(labels) == len(set(labels)), "Duplicate labels: %r" % (labels,)

    dw = DatasetWriter(
        columns={n: 'bytes' for n in labels},
        filename=orig_filename,
        hashlabel=options.hashlabel,
        caption='csvimport of ' + orig_filename,
        previous=datasets.previous,
        meta_only=True,
    )
    return separator, filename, orig_filename, labels, dw,


def _split_quoted_labels(labels_str, sep):
    # Split one label line on sep, treating fields that start with ' or " as quoted.
    labels = []
    while labels_str is not None:
        if labels_str.startswith(('"', "'",)):
            q = labels_str[0]
            pos = 1
            # Look for the closing quote: a quote character that is
            # followed by the separator or sits at the end of the line.
            while pos + 1 < len(labels_str):
                pos = labels_str.find(q, pos)
                if pos == -1: # all is lost
                    pos = len(labels_str) - 1
                if pos + 1 == len(labels_str): # eol
                    break
                if labels_str[pos + 1] == sep:
                    break
                # we'll just assume it was a quote, because what else to do?
                # Drop this quote character and step past whatever follows it.
                labels_str = labels_str[:pos] + labels_str[pos + 1:]
                pos += 1
            labels.append(labels_str[1:pos])
            if len(labels_str) > pos + 1:
                # Skip the closing quote and the separator after it.
                labels_str = labels_str[pos + 2:]
            else:
                labels_str = None
        else:
            if sep in labels_str:
                field, labels_str = labels_str.split(sep, 1)
            else:
                field, labels_str = labels_str, None
            labels.append(field)
    return labels
def __init__(self, *a, **kw):
    """Open the gzutil line reader matching the running python version.

    All arguments are passed unchanged to gzutil.GzUnicodeLines on
    python3 and to gzutil.GzBytesLines otherwise; the reader is stored
    as ``self.fh``.
    """
    reader_cls = gzutil.GzUnicodeLines if PY3 else gzutil.GzBytesLines
    self.fh = reader_cls(*a, **kw)
# BOM handling, scenario 0: a plain file written by hand, with a BOM at
# the start of each of its two lines.
with open(TMP_FN, "wb") as fh:
    fh.write(b"\xef\xbb\xbfa\n\xef\xbb\xbfb")
test_read_bom(0)
# Scenario 1: the writer is asked for a BOM (write_bom=True) and the second
# line carries an explicit U+FEFF; readers must see the same as scenario 0.
with gzutil.GzWriteUnicodeLines(TMP_FN, write_bom=True) as fh:
    fh.write("a")
    fh.write("\ufeffb")
test_read_bom(1)
# Scenario 2: as above, but the first line also starts with an explicit
# U+FEFF, so readers are expected to see one extra U+FEFF prefix.
with gzutil.GzWriteUnicodeLines(TMP_FN, write_bom=True) as fh:
    fh.write("\ufeffa")
    fh.write("\ufeffb")
test_read_bom(2, "\ufeff")
# Without write_bom the output must not start with a BOM.
with gzutil.GzWriteUnicodeLines(TMP_FN) as fh:
    fh.write("a")
assert next(gzutil.GzBytesLines(TMP_FN)) == b"a", "GzWriteUnicodeLines writes BOM when not requested"

print("Append test")
# And finally verify appending works as expected.
# The second writer uses mode="a", so both values must be read back in order.
with gzutil.GzWriteInt64(TMP_FN) as fh:
    fh.write(42)
with gzutil.GzWriteInt64(TMP_FN, mode="a") as fh:
    fh.write(18)
with gzutil.GzInt64(TMP_FN) as fh:
    assert list(fh) == [42, 18]

print("Untyped writer test")
with gzutil.GzWrite(TMP_FN) as fh:
    # A trivial subclass of bytes, defined for use in this test.
    class SubString(bytes): pass