def to_cbor(json_file, output_file):
    print('Converting to cbor')
    with open(json_file, 'r') as fi:
        with open(output_file, 'wb') as fo:
            dic = json.load(fi)
            dic_mapped = replace_dict_key_recursively(dic, JSON_TO_CBOR_MAPPER)
            cbor.dump(dic_mapped, fo)
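# A minimal companion sketch (not part of the original module) showing the
# reverse direction: read the CBOR file back with cbor.load and write JSON.
# `replace_dict_key_recursively` is the helper used above; CBOR_TO_JSON_MAPPER
# is a hypothetical inverse of JSON_TO_CBOR_MAPPER.
def to_json(cbor_file, output_file):
    print('Converting to json')
    with open(cbor_file, 'rb') as fi:
        with open(output_file, 'w') as fo:
            dic = cbor.load(fi)
            dic_mapped = replace_dict_key_recursively(dic, CBOR_TO_JSON_MAPPER)
            json.dump(dic_mapped, fo)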
def add(self, item):
    '''Add `item` to `current_file`, opening it as a temporary file if not
    already open. This also constructs `current_file_path` when it opens
    the temporary file.
    '''
    if self.current_file is None:
        ## construct a final path to which this file will be moved
        ## when it rolls
        self.current_file_path = os.path.join(
            self.output_dir,
            'trec-dd-local-politics-%d.cbor' % self.total_written)
        if self.compress:
            self.current_file = gzip.open(self.tmp_file_path, 'wb')
            self.current_file_path += '.gz'
        else:
            self.current_file = open(self.tmp_file_path, 'wb')

    ## write the data
    cbor.dump(item, self.current_file)

    ## roll the files each time we reach max_chunk_size
    self.total_written += 1
    if self.total_written % self.max_chunk_size == 0:
        self.roll()
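# Rough read-back sketch (an assumption, not part of the original class):
# items written by add() are concatenated CBOR values, so a rolled chunk can
# be read sequentially with repeated cbor.load calls. The exact end-of-file
# exception depends on the cbor version, hence the broad catch here.
def read_chunk(path):
    opener = gzip.open if path.endswith('.gz') else open
    with opener(path, 'rb') as fh:
        while True:
            try:
                yield cbor.load(fh)
            except (EOFError, ValueError):
                break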
def main(n):
    for i in range(n):
        rec = {'url': 'http://{}.localdomain/{}'.format(i, i)}
        k, v = identifier(i)
        rec[k] = [v]
        dump(rec, stdout)
def do_status(self, args):
    logfile = None
    try:
        while True:
            they = []
            for ws in self.task_master.iter_work_specs():
                name = ws['name']
                counts = self.task_master.count_work_units(name)
                they.append({'name': name, 'data': ws, 'counts': counts})
            they.sort(key=lambda x: x['name'])
            if args.logfile:
                record = {'time': time.time(), 'ws': they}
                if logfile is None:
                    logfile = open(args.logfile, 'ab')
                cbor.dump(record, logfile)
                logfile.flush()
            else:
                # write json text to stdout
                self.stdout.write(json.dumps(they) + '\n')
                self.stdout.flush()
            if args.repeat_seconds is None or not args.repeat_seconds > 0.0:
                break
            time.sleep(args.repeat_seconds)
    finally:
        if logfile is not None:
            logfile.close()
def save_variables(filename='demo_vars'):
    global mode_library
    outfile = open(filename, 'wb')
    variable_names = [
        variable for variable in dir(mode_library)
        if not variable.startswith('__')
    ]
    variables = {}
    for variable_name in variable_names:
        variable = getattr(mode_library, variable_name)
        if not callable(variable) and not isinstance(variable, ModuleType) \
                and type(variable) in [int, bool, str, dict, list]:
            try:
                variables[variable_name] = variable
            except TypeError:
                print(" Can't encode with cbor:", variable_name)
    try:
        cbor.dump(variables, outfile)
    except Exception:
        print(" Error during save...")
    outfile.close()
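# Hedged companion sketch (assumed, not in the original code): restore the
# variables saved above by reading them with cbor.load and setting them back
# onto mode_library with setattr.
def load_variables(filename='demo_vars'):
    global mode_library
    try:
        with open(filename, 'rb') as infile:
            variables = cbor.load(infile)
    except (IOError, OSError):
        print(" Error during load...")
        return
    for variable_name, variable in variables.items():
        setattr(mode_library, variable_name, variable)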
def dump(self, key, conversation_id, payload):
    if self.save:
        filename = f"{key}.{conversation_id}.cbor"
        if os.path.exists(filename):
            filename = f"{key}.{conversation_id}.2.cbor"
        with open(filename, "wb") as output:
            cbor.dump(payload, output)
        print("Payload saved as", filename)
    else:
        print("Payload:")
        pprint.pprint(payload)
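# A minimal sketch of the matching loader, assuming the same naming scheme as
# dump() above; this `load` method is illustrative and not part of the
# original class.
def load(self, key, conversation_id):
    filename = f"{key}.{conversation_id}.cbor"
    with open(filename, "rb") as source:
        payload = cbor.load(source)
    return payload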
def write(self, kind: FileType) -> None:
    fn = self._path(kind)
    if kind == FileType.PICKLE:
        # serialize as TreeNode
        with open(fn, "wb") as f:
            pickle.dump(self.treenode, f, protocol=-1)
    elif kind == FileType.CSV:
        # serialize as id_dict
        with open(fn, "w") as f:
            w = csv.DictWriter(f, Node._fields)
            w.writeheader()
            for item in self.treenode.node_iter():
                w.writerow(item._asdict())
    elif kind == FileType.MSGPACK:
        # https://msgpack-python.readthedocs.io/en/latest/api.html
        with open(fn, "wb") as f:
            # Doesn't improve speed
            # msgpack.pack(self._to_dict(), f, use_bin_type=True)
            msgpack.pack(self.to_dict_list(), f)
    elif kind == FileType.JSON:
        self._json_dump(fn, json.dump)
    elif kind == FileType.UJSON:
        self._json_dump(fn, ujson.dump)
    elif kind == FileType.SIMPLEJSON:
        # NOTE: simplejson includes key names when serializing NamedTuples
        with open(fn, "w") as f:
            if self.json_dict_list:
                simplejson.dump(list(self.id_dict.values()), f, ensure_ascii=True)
            else:
                simplejson.dump(self.id_dict, f, ensure_ascii=True)
    elif kind == FileType.CBOR2:
        with open(fn, "wb") as f:
            cbor2.dump(self.to_dict_list(), f)
    elif kind == FileType.CBOR:
        with open(fn, "wb") as f:
            cbor.dump(self.to_dict_list(), f)
    elif kind == FileType.RAPIDJSON:
        # https://python-rapidjson.readthedocs.io/en/latest/benchmarks.html
        # TODO: See this example for possible speed improvement - deeper integration with Node
        # https://python-rapidjson.readthedocs.io/en/latest/encoder.html
        # NOTE: can't use id_dict - keys must be strings
        #       can't use self.id_dict.values() - not serializable
        #       list(self.id_dict.values()) produces a list of lists - no keys - very fragile
        with open(fn, "w") as f:
            if self.json_dict_list:
                rapidjson.Encoder(number_mode=rapidjson.NM_NATIVE,
                                  ensure_ascii=False)(self.to_dict_list(), f)
            else:
                rapidjson.Encoder(number_mode=rapidjson.NM_NATIVE,
                                  ensure_ascii=False)(list(self.id_dict.values()), f)
    elif kind == FileType.BSON:
        with open(fn, "wb") as f:
            co = CodecOptions(document_class=RawBSONDocument)
            for node in self.treenode.node_iter():
                f.write(BSON.encode(node._asdict(), codec_options=co))
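# Sketch of the corresponding read path for the two CBOR branches above,
# assuming the same _path() helper; this `read_cbor` method is illustrative
# and not part of the original class.
def read_cbor(self, kind: FileType):
    fn = self._path(kind)
    with open(fn, "rb") as f:
        if kind == FileType.CBOR2:
            return cbor2.load(f)
        elif kind == FileType.CBOR:
            return cbor.load(f)
    return None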
def main():
    parser = argparse.ArgumentParser(
        'process streamcorpus.Chunk files to generate CBOR files'
        ' to load into memex_dossier.akagraph.'
    )
    parser.add_argument('input_paths', nargs='+',
                        help='paths to streamcorpus.Chunk files')
    parser.add_argument('--output-path',
                        help='cbor file (or cbor.gz) to create')
    parser.add_argument('--xform', action='store_true', default=False,
                        help='run structured_features transform before page_extractors')
    parser.add_argument('--total', type=int,
                        help='anticipated number of StreamItems')
    parser.add_argument('--limit', type=int,
                        help='stop processing after this many StreamItems')
    args = parser.parse_args()

    xform = structured_features(structured_features.default_config)

    fopen = open
    if args.output_path.endswith('.gz'):
        fopen = gzip.open
    fh = fopen(args.output_path, 'wb')

    count = 0
    start = time.time()
    for path in args.input_paths:
        for si in Chunk(path):
            count += 1
            if count % 100 == 0:
                elapsed = time.time() - start
                rate = count / elapsed
                msg = '%d done in %.1f secs --> %.1f per sec' % (count, elapsed, rate)
                if args.total:
                    remaining = (args.total - count) / rate
                    msg += ' --> %.1f sec remaining' % remaining
                print(msg)
                sys.stdout.flush()
            if args.limit and count > args.limit:
                break
            #url_parts = urlparse(si.abs_url)
            if args.xform:
                si = xform(si)
            slots = profile_page(si)
            if slots:
                slots = cbor.loads(slots)
                better_slots = {}
                for key, values in slots['slots'].iteritems():
                    assert isinstance(values, list), values
                    better_slots[key.lower()] = [
                        unicodedata.normalize('NFKC', v).lower() for v in values]
                better_slots['url'] = si.abs_url
                cbor.dump(better_slots, fh)
    fh.close()
    print('done')
def _cbor_dump(self, fileo):
    with self.mutex:
        state = self.__getstate__()
        try:
            return cbor.dump(state, fileo)
        except:
            logger.error('could not cbor serialize state for spec %s',
                         self.name, exc_info=True)
            raise
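# Hypothetical counterpart sketch (not from the original source): restore the
# spec state written by _cbor_dump, assuming __setstate__ mirrors
# __getstate__.
def _cbor_load(self, filei):
    with self.mutex:
        try:
            state = cbor.load(filei)
        except Exception:
            logger.error('could not cbor deserialize state for spec %s',
                         self.name, exc_info=True)
            raise
        self.__setstate__(state)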
def mark_as_downloaded(self, link_key, obj_changes):
    link_as_file = self.make_link_into_file(link_key)
    if not os.path.exists(link_as_file):
        # add the data to the file
        link_data = {
            dimname: dimchange
            for dimname, dimchange in obj_changes["dims"].iteritems()
            if dimname != "download_complete"
        }
        cbor.dump(link_data, open(link_as_file, "wb"))
    new_data = {"download_complete": {"type": Record.BOOL, "value": True}}
    if "error_reason" in obj_changes["dims"]:
        new_data["error_reason"] = obj_changes["dims"]["error_reason"]
    else:
        new_data["error_reason"] = {"type": Record.STRING, "value": ""}
    obj_changes["dims"] = new_data
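# Hypothetical helper sketch (assumed, not part of the original class): read
# the per-link record written above with cbor.load, using the same
# make_link_into_file path helper.
def read_link_data(self, link_key):
    link_as_file = self.make_link_into_file(link_key)
    with open(link_as_file, "rb") as fh:
        return cbor.load(fh)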
def main():
    def dflt_build_date():
        v = os.environ.get("BUILD_DATE")
        if v:
            return dt_parse(v)
        else:
            return DT.now(tz=tzlocal())

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("-o", "--outfile", metavar="OUTFILE",
                        type=argparse.FileType("wb"), required=True,
                        help="Output file")
    parser.add_argument("--no-compress", action="store_true",
                        help="Do not compress images")

    meta = parser.add_argument_group("Metadata")
    meta.add_argument("--desc", help="Description")
    meta.add_argument("-b", "--board", required=True, help="Board ID")
    meta.add_argument("-d", "--build-date", type=dt_parse,
                      default=dflt_build_date(), help="Build date")
    meta_scm_excl = meta.add_mutually_exclusive_group(required=True)
    meta_scm_excl.add_argument("-vr", "--ver-rev", nargs=2,
                               metavar=('VER', 'REV'),
                               help="Version and SCM revision strings")
    meta_scm_excl.add_argument("--git-identity", action="store_true",
                               help="Get revision and version from GIT SCM")

    # unfortunately argparse can't parse grouped flags like
    # "-n abc.bin file.bin -a 0x00800000 file2.bin" into:
    #   <group #1: name=abc.bin srcfile=file.bin>
    #   <group #2: load_addr=0x00800000 srcfile=file2.bin>
    srcgroup = parser.add_argument_group("Source file arguments")
    srcgroup.add_argument("-n", "--name", action="append",
                          help="Image name (e.g. firmware.bin)")
    srcgroup.add_argument("-a", "--load-addr", metavar="ADDR",
                          type=lambda x: int(x, base=0), action="append",
                          help="Image load address")
    srcgroup.add_argument("--dfu-alt", metavar="N", type=int, action="append",
                          help="DFU Alternate setting")
    srcgroup.add_argument("infile", metavar="INFILE",
                          type=argparse.FileType("rb"), nargs="+",
                          help="Source file(s)")

    args = parser.parse_args()

    # Known bug in _cbor.so (ver 0.1.25): it cannot encode Tag 0
    # https://bitbucket.org/bodhisnarkva/cbor/issues/11/failed-to-encode-tag-0-invalid-negative
    if args.git_identity:
        p = subprocess.Popen(["git", "rev-parse", "HEAD"],
                             stdout=subprocess.PIPE)
        rev_parse, err = p.communicate()
        p = subprocess.Popen(["git", "describe", "--always", "--dirty"],
                             stdout=subprocess.PIPE)
        describe, err = p.communicate()
        ver, rev = describe.decode().strip(), rev_parse.decode().strip()
    else:
        ver, rev = args.ver_rev

    image_meta = {
        'description': args.desc or '',
        'build_date': cbor.Tag(0, args.build_date.isoformat()),
        'version': ver,
        'revision': rev,
        'board': args.board,
    }

    images = {}

    def getindex(lst, idx, default=None):
        if lst and len(lst) > idx:
            return lst[idx]
        else:
            return default

    for i, src in enumerate(args.infile):
        name = getindex(args.name, i, basename(src.name))
        addr = getindex(args.load_addr, i)
        dalt = getindex(args.dfu_alt, i)

        image = {}
        if addr is not None:
            image['load_address'] = addr
        if dalt is not None:
            image['dfu_alt'] = dalt

        with src as fd:
            buffer = bytes(fd.read())

        hasher = hashlib.sha1()
        hasher.update(buffer)
        image['size'] = len(buffer)
        image['sha1sum'] = bytes(hasher.digest())

        # less effective if the data is close to random, but in the usual case
        # the compressed form is smaller
        deflated_buffer = bytes(zlib.compress(buffer, 9))
        if len(buffer) > len(deflated_buffer) and not args.no_compress:
            # Tag: 'z' * 100 + 22 -> zipped
            image['image'] = cbor.Tag(12222, deflated_buffer)
        else:
            image['image'] = buffer

        images[name] = image

    with args.outfile as fd:
        # In version 1.0.0 the Tag(0) bug is fixed.
        cbor.dump(("OTAFWv1", image_meta, images), fd)
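# Minimal sketch (an assumption, not part of the original tool) of how the
# resulting OTAFWv1 bundle could be read back: cbor.Tag exposes .tag and
# .value, and tag 12222 marks a zlib-deflated image payload.
def read_bundle(path):
    with open(path, "rb") as fd:
        magic, meta, images = cbor.load(fd)
    assert magic == "OTAFWv1"
    for name, image in images.items():
        data = image['image']
        if isinstance(data, cbor.Tag) and data.tag == 12222:
            data = zlib.decompress(data.value)
        image['image'] = data
    return meta, images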
def _write_block(self, blktype, **kwargs):
    LOG.debug('writing block: type=%s, content=%s', blktype, repr(kwargs))
    cbor.dump(dict(blktype=blktype, **kwargs), self.fh)
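# Companion sketch (assumed, not in the original class): read a single block
# back from the same file handle with cbor.load.
def _read_block(self):
    block = cbor.load(self.fh)
    LOG.debug('read block: type=%s', block.get('blktype'))
    return block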
def Dumps(self, dict_obj, stream_buf):
    cbor.dump(dict_obj, stream_buf)
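# A hypothetical matching deserializer for the wrapper above; the `Loads`
# name is an assumption, not taken from the original code.
def Loads(self, stream_buf):
    return cbor.load(stream_buf)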
# Parse the JSON string from the file into
# a Python dictionary object
data = json.load(fin)

# Write the object to the file in BSON
fout.write(bson.serialize_to_bytes(data))

# Close both files
fin.close()
fout.close()

# Open a new file for writing out the CBOR encoding
fout = open(fname + '.cbor', 'wb')

# Use the CBOR codec to write out the data
cbor.dump(data, fout)

# Close the CBOR file
fout.close()

# Open the BSON version in read-only mode, and a new file
# for the roundtrip JSON output.
fin = open(fname + '.bson', 'rb')
fout = open(fname + '-roundtrip.json', 'w')

# Parse the BSON file into a Python dictionary object
data = bson.parse_stream(fin)

# Dump the dictionary object out in JSON format
json.dump(data, fout)
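# Sketch of the analogous CBOR roundtrip (an assumption, mirroring the BSON
# steps above): read the CBOR file back with cbor.load and dump it out as
# JSON again. The '-cbor-roundtrip.json' filename is illustrative.
fin.close()
fout.close()

fin = open(fname + '.cbor', 'rb')
fout = open(fname + '-cbor-roundtrip.json', 'w')

# Parse the CBOR file into a Python dictionary object
data = cbor.load(fin)

# Dump the dictionary object out in JSON format
json.dump(data, fout)

fin.close()
fout.close()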
def write_msg_impl(self, msg):
    assert self._o_chunk_fh is not None
    cbor.dump(msg, self._o_chunk_fh)
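# Assumed counterpart sketch for reading one message back; `_i_chunk_fh` is a
# hypothetical input file handle, not a name from the original code.
def read_msg_impl(self):
    assert self._i_chunk_fh is not None
    return cbor.load(self._i_chunk_fh)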