def get_raw_strings(dataset, options):
    """
    Iterate over the dataset and collect the raw string stored under ``options.key`` for each sequence.

    Sequences ``options.startseq`` up to and including ``options.endseq`` are visited.
    A negative ``options.endseq`` means "no upper limit"; in that case ``options.endseq``
    is overwritten in place with ``float("inf")``.

    The data of a sequence may be a ``str``, ``bytes``, a 0-dim numpy array wrapping one of those,
    or a 1-dim ``uint8`` numpy array holding UTF-8 encoded bytes.

    :param Dataset dataset:
    :param options: argparse.Namespace. Reads ``startseq``, ``endseq`` and ``key``.
    :return: list of (seq tag, string)
    :rtype: list[(str,str)]
    """
    refs = []
    start_time = time.time()
    seq_len_stats = Stats()
    seq_idx = options.startseq
    if options.endseq < 0:
        options.endseq = float("inf")
    # The progress bar and the per-seq verbose output are mutually exclusive.
    interactive = Util.is_tty() and not log.verbose[5]
    print("Iterating over %r." % dataset, file=log.v2)
    while dataset.is_less_than_num_seqs(seq_idx) and seq_idx <= options.endseq:
        dataset.load_seqs(seq_idx, seq_idx + 1)
        complete_frac = dataset.get_complete_frac(seq_idx)
        try:
            num_seqs_s = str(dataset.num_seqs)
        except NotImplementedError:
            try:
                num_seqs_s = "~%i" % dataset.estimated_num_seqs
            except TypeError:  # a number is required, not NoneType
                num_seqs_s = "?"
        progress_prefix = "%i/%s" % (seq_idx, num_seqs_s)
        seq_tag = dataset.get_tag(seq_idx)
        assert isinstance(seq_tag, str)
        ref = dataset.get_data(seq_idx, options.key)
        if isinstance(ref, numpy.ndarray):
            assert ref.shape == () or (ref.ndim == 1 and ref.dtype == numpy.uint8)
            if ref.shape == ():
                ref = ref.flatten()[0]  # get the entry itself (str or bytes)
            else:
                ref = ref.tobytes()
        if isinstance(ref, bytes):
            ref = ref.decode("utf8")
        assert isinstance(ref, str)
        seq_len_stats.collect([len(ref)])
        refs.append((seq_tag, ref))
        if interactive:
            Util.progress_bar_with_time(complete_frac, prefix=progress_prefix)
        elif log.verbose[5]:
            print(progress_prefix, "seq tag %r, ref len %i chars" % (seq_tag, len(ref)))
        seq_idx += 1
    # Report the number of collected seqs. seq_idx is an absolute index and would
    # over-count whenever options.startseq > 0.
    print("Done. Num seqs %i. Total time %s." % (
        len(refs), hms(time.time() - start_time)), file=log.v1)
    print("More seqs which we did not dumped: %s." % (
        dataset.is_less_than_num_seqs(seq_idx),), file=log.v1)
    seq_len_stats.dump(stream_prefix="Seq-length %r " % (options.key,), stream=log.v2)
    return refs
def get_raw_strings(dataset, options):
    """
    Iterate over the dataset and collect the raw string stored under ``options.key`` for each sequence.

    Sequences ``options.startseq`` up to and including ``options.endseq`` are visited.
    A negative ``options.endseq`` means "no upper limit"; in that case ``options.endseq``
    is overwritten in place with ``float("inf")``.

    The data of a sequence may be a ``str``, ``bytes``, a 0-dim numpy array wrapping one of those,
    or a 1-dim ``uint8`` numpy array holding UTF-8 encoded bytes.

    :param Dataset dataset:
    :param options: argparse.Namespace. Reads ``startseq``, ``endseq`` and ``key``.
    :return: list of (seq tag, string)
    :rtype: list[(str,str)]
    """
    refs = []
    start_time = time.time()
    seq_len_stats = Stats()
    seq_idx = options.startseq
    if options.endseq < 0:
        options.endseq = float("inf")
    # The progress bar and the per-seq verbose output are mutually exclusive.
    interactive = Util.is_tty() and not log.verbose[5]
    print("Iterating over %r." % dataset, file=log.v2)
    while dataset.is_less_than_num_seqs(seq_idx) and seq_idx <= options.endseq:
        dataset.load_seqs(seq_idx, seq_idx + 1)
        complete_frac = dataset.get_complete_frac(seq_idx)
        try:
            num_seqs_s = str(dataset.num_seqs)
        except NotImplementedError:
            try:
                num_seqs_s = "~%i" % dataset.estimated_num_seqs
            except TypeError:  # a number is required, not NoneType
                num_seqs_s = "?"
        progress_prefix = "%i/%s" % (seq_idx, num_seqs_s)
        seq_tag = dataset.get_tag(seq_idx)
        assert isinstance(seq_tag, str)
        ref = dataset.get_data(seq_idx, options.key)
        if isinstance(ref, numpy.ndarray):
            assert ref.shape == () or (ref.ndim == 1 and ref.dtype == numpy.uint8)
            if ref.shape == ():
                ref = ref.flatten()[0]  # get the entry itself (str or bytes)
            else:
                ref = ref.tobytes()
        if isinstance(ref, bytes):
            ref = ref.decode("utf8")
        assert isinstance(ref, str)
        seq_len_stats.collect([len(ref)])
        refs.append((seq_tag, ref))
        if interactive:
            Util.progress_bar_with_time(complete_frac, prefix=progress_prefix)
        elif log.verbose[5]:
            print(progress_prefix, "seq tag %r, ref len %i chars" % (seq_tag, len(ref)))
        seq_idx += 1
    # Report the number of collected seqs. seq_idx is an absolute index and would
    # over-count whenever options.startseq > 0.
    print("Done. Num seqs %i. Total time %s." % (
        len(refs), hms(time.time() - start_time)), file=log.v1)
    print("More seqs which we did not dumped: %s." % (
        dataset.is_less_than_num_seqs(seq_idx),), file=log.v1)
    seq_len_stats.dump(stream_prefix="Seq-length %r " % (options.key,), stream=log.v2)
    return refs
def analyze_dataset(options):
    """
    Generate batches over the dataset and print per-batch and accumulated statistics
    (number of batches, number of seqs, padded frames vs. actually used frames).

    Note that ``dataset`` and ``config`` are not parameters; they are looked up
    from the enclosing (module) scope.
    A negative ``options.endseq`` is overwritten in place with ``float("inf")``.

    :param options: argparse.Namespace. Reads ``epoch``, ``key`` and ``endseq``.
    """
    print("Epoch: %i" % options.epoch, file=log.v3)
    print("Dataset keys:", dataset.get_data_keys(), file=log.v3)
    print("Dataset target keys:", dataset.get_target_list(), file=log.v3)
    assert options.key in dataset.get_data_keys()

    terminal_width, _ = Util.terminal_size()
    # Progress bar only at verbosity 3..4; at verbosity 5 the per-batch line is printed instead.
    show_interactive_process_bar = (log.verbose[3] and (not log.verbose[5]) and terminal_width >= 0)

    start_time = time.time()
    num_seqs_stats = Stats()
    if options.endseq < 0:
        options.endseq = float("inf")

    # Batching parameters are taken from the config, with the defaults given here.
    recurrent = True
    used_data_keys = dataset.get_data_keys()
    batch_size = config.typed_value('batch_size', 1)
    max_seqs = config.int('max_seqs', -1)
    seq_drop = config.float('seq_drop', 0.0)
    max_seq_length = config.typed_value('max_seq_length', None) or config.float('max_seq_length', 0)
    max_pad_size = config.typed_value("max_pad_size", None)

    batches = dataset.generate_batches(
        recurrent_net=recurrent,
        batch_size=batch_size,
        max_seqs=max_seqs,
        max_seq_length=max_seq_length,
        max_pad_size=max_pad_size,
        seq_drop=seq_drop,
        used_data_keys=used_data_keys)

    step = 0
    total_num_seqs = 0
    total_num_frames = NumbersDict()
    total_num_used_frames = NumbersDict()

    try:
        while batches.has_more():
            # See FeedDictDataProvider.
            batch, = batches.peek_next_n(1)
            assert isinstance(batch, Batch)
            if batch.start_seq > options.endseq:
                break
            dataset.load_seqs(batch.start_seq, batch.end_seq)
            complete_frac = batches.completed_frac()
            start_elapsed = time.time() - start_time
            try:
                num_seqs_s = str(dataset.num_seqs)
            except NotImplementedError:
                try:
                    num_seqs_s = "~%i" % dataset.estimated_num_seqs
                except TypeError:  # a number is required, not NoneType
                    num_seqs_s = "?"
            progress_prefix = "%i/%s" % (batch.start_seq, num_seqs_s)
            progress = "%s (%.02f%%)" % (progress_prefix, complete_frac * 100)
            if complete_frac > 0:
                # Linear extrapolation of the remaining time from the elapsed time.
                total_time_estimated = start_elapsed / complete_frac
                remaining_estimated = total_time_estimated - start_elapsed
                progress += " (%s)" % hms(remaining_estimated)

            # Frames of the batch including padding: longest seq times number of seqs.
            batch_max_time = NumbersDict.max([seq.frame_length for seq in batch.seqs]) * len(batch.seqs)
            # Frames of the batch without padding: sum over the seq lengths.
            batch_num_used_frames = sum([seq.frame_length for seq in batch.seqs], NumbersDict())
            total_num_seqs += len(batch.seqs)
            num_seqs_stats.collect(numpy.array([len(batch.seqs)]))
            total_num_frames += batch_max_time
            total_num_used_frames += batch_num_used_frames

            print(
                "%s, batch %i, num seqs %i, frames %s, used %s (%s)" % (
                    progress, step, len(batch.seqs),
                    batch_max_time, batch_num_used_frames, batch_num_used_frames / batch_max_time),
                file=log.v5)
            if show_interactive_process_bar:
                Util.progress_bar_with_time(complete_frac, prefix=progress_prefix)

            step += 1
            batches.advance(1)

    finally:
        # NOTE(review): the original indentation was lost; the whole summary is assumed to
        # belong to this finally block (so it is also printed on interruption) -- confirm.
        print("Done. Total time %s. More seqs which we did not dumped: %s" % (
            hms(time.time() - start_time), batches.has_more()), file=log.v2)
        print("Dataset epoch %i, order %r." % (dataset.epoch, dataset.seq_ordering))
        print("Num batches (steps): %i" % step, file=log.v1)
        print("Num seqs: %i" % total_num_seqs, file=log.v1)
        num_seqs_stats.dump(stream=log.v1, stream_prefix="Batch num seqs ")
        for key in used_data_keys:
            print("Data key %r:" % key, file=log.v1)
            print(" Num frames: %s" % total_num_frames[key], file=log.v1)
            print(" Num used frames: %s" % total_num_used_frames[key], file=log.v1)
            print(" Fraction used frames: %s" % (total_num_used_frames / total_num_frames)[key], file=log.v1)
        dataset.finish_epoch()
def dump_dataset(dataset, options):
    """
    Iterate over the dataset and dump every sequence in the way selected by ``options.type``.

    Handled types: ``numpy`` (one text file per seq via ``numpy.savetxt``), ``stdout``,
    ``print_tag``, ``dump_tag`` (tags written to ``<dump_prefix>seq-tags.txt``),
    ``dump_seq_len`` (lengths of ``options.key`` written to ``<dump_prefix>seq-lens.txt``),
    ``print_shape``, ``plot``, ``interactive`` and ``null``.

    If ``options.get_num_seqs`` is set, only the number of sequences is reported and
    the function returns without dumping anything.
    A negative ``options.endseq`` is overwritten in place with ``float("inf")``.

    :type dataset: Dataset.Dataset
    :param options: argparse.Namespace
    :raises Exception: if ``options.type`` is not one of the handled dump types
    """
    print("Epoch: %i" % options.epoch, file=log.v3)
    dataset.init_seq_order(epoch=options.epoch)
    print("Dataset keys:", dataset.get_data_keys(), file=log.v3)
    print("Dataset target keys:", dataset.get_target_list(), file=log.v3)
    assert options.key in dataset.get_data_keys()

    if options.get_num_seqs:
        print("Get num seqs.")
        print("estimated_num_seqs: %r" % dataset.estimated_num_seqs)
        try:
            print("num_seqs: %r" % dataset.num_seqs)
        except Exception as exc:
            # The dataset cannot tell its size up front, so count by iterating.
            print("num_seqs exception %r, which is valid, so we count." % exc)
            seq_idx = 0
            if dataset.get_target_list():
                default_target = dataset.get_target_list()[0]
            else:
                default_target = None
            while dataset.is_less_than_num_seqs(seq_idx):
                dataset.load_seqs(seq_idx, seq_idx + 1)
                if seq_idx % 10000 == 0:
                    if default_target:
                        targets = dataset.get_targets(default_target, seq_idx)
                        postfix = " (targets = %r...)" % (targets[:10],)
                    else:
                        postfix = ""
                    print("%i ...%s" % (seq_idx, postfix))
                seq_idx += 1
            print("accumulated num seqs: %i" % seq_idx)
        print("Done.")
        return

    dump_file = None
    # try/finally so that an opened dump file is closed even if dumping fails midway.
    try:
        if options.type == "numpy":
            print("Dump files: %r*%r" % (options.dump_prefix, options.dump_postfix), file=log.v3)
        elif options.type == "stdout":
            print("Dump to stdout", file=log.v3)
            if options.stdout_limit is not None:
                Util.set_pretty_print_default_limit(options.stdout_limit)
                numpy.set_printoptions(
                    threshold=sys.maxsize if options.stdout_limit == float("inf") else int(options.stdout_limit))
            if options.stdout_as_bytes:
                Util.set_pretty_print_as_bytes(options.stdout_as_bytes)
        elif options.type == "print_tag":
            print("Dump seq tag to stdout", file=log.v3)
        elif options.type == "dump_tag":
            dump_file = open("%sseq-tags.txt" % options.dump_prefix, "w")
            print("Dump seq tag to file: %s" % (dump_file.name,), file=log.v3)
        elif options.type == "dump_seq_len":
            dump_file = open("%sseq-lens.txt" % options.dump_prefix, "w")
            print("Dump seq lens to file: %s" % (dump_file.name,), file=log.v3)
            dump_file.write("{\n")
        elif options.type == "print_shape":
            print("Dump shape to stdout", file=log.v3)
        elif options.type == "plot":
            print("Plot.", file=log.v3)
        elif options.type == "interactive":
            print("Interactive debug shell.", file=log.v3)
        elif options.type == "null":
            print("No dump.")
        else:
            raise Exception("unknown dump option type %r" % options.type)

        start_time = time.time()
        stats = Stats() if (options.stats or options.dump_stats) else None
        seq_len_stats = {key: Stats() for key in dataset.get_data_keys()}
        seq_idx = options.startseq
        if options.endseq < 0:
            options.endseq = float("inf")
        while dataset.is_less_than_num_seqs(seq_idx) and seq_idx <= options.endseq:
            dataset.load_seqs(seq_idx, seq_idx + 1)
            complete_frac = dataset.get_complete_frac(seq_idx)
            start_elapsed = time.time() - start_time
            try:
                num_seqs_s = str(dataset.num_seqs)
            except NotImplementedError:
                try:
                    num_seqs_s = "~%i" % dataset.estimated_num_seqs
                except TypeError:  # a number is required, not NoneType
                    num_seqs_s = "?"
            progress_prefix = "%i/%s" % (seq_idx, num_seqs_s)
            progress = "%s (%.02f%%)" % (progress_prefix, complete_frac * 100)
            if complete_frac > 0:
                # Linear extrapolation of the remaining time from the elapsed time.
                total_time_estimated = start_elapsed / complete_frac
                remaining_estimated = total_time_estimated - start_elapsed
                progress += " (%s)" % hms(remaining_estimated)

            # The tag/length-only dump types below do not need the data itself.
            data = None
            if options.type == "print_tag":
                print("seq %s tag:" % (progress if log.verbose[2] else progress_prefix), dataset.get_tag(seq_idx))
            elif options.type == "dump_tag":
                print("seq %s tag:" % (progress if log.verbose[2] else progress_prefix), dataset.get_tag(seq_idx))
                dump_file.write("%s\n" % dataset.get_tag(seq_idx))
            elif options.type == "dump_seq_len":
                seq_len = dataset.get_seq_length(seq_idx)[options.key]
                print(
                    "seq %s tag:" % (progress if log.verbose[2] else progress_prefix),
                    dataset.get_tag(seq_idx), "%r len:" % options.key, seq_len)
                dump_file.write("%r: %r,\n" % (dataset.get_tag(seq_idx), seq_len))
            else:
                data = dataset.get_data(seq_idx, options.key)
                if options.type == "numpy":
                    numpy.savetxt("%s%i.data%s" % (options.dump_prefix, seq_idx, options.dump_postfix), data)
                elif options.type == "stdout":
                    print("seq %s tag:" % progress, dataset.get_tag(seq_idx))
                    print("seq %s data:" % progress, pretty_print(data))
                elif options.type == "print_shape":
                    print("seq %s data shape:" % progress, data.shape)
                elif options.type == "plot":
                    plot(data)
                for target in dataset.get_target_list():
                    targets = dataset.get_targets(target, seq_idx)
                    if options.type == "numpy":
                        numpy.savetxt(
                            "%s%i.targets.%s%s" % (options.dump_prefix, seq_idx, target, options.dump_postfix),
                            targets, fmt='%i')
                    elif options.type == "stdout":
                        extra = ""
                        if target in dataset.labels and len(dataset.labels[target]) > 1:
                            assert dataset.can_serialize_data(target)
                            extra += " (%r)" % dataset.serialize_data(key=target, data=targets)
                        print("seq %i target %r: %s%s" % (seq_idx, target, pretty_print(targets), extra))
                    elif options.type == "print_shape":
                        print("seq %i target %r shape:" % (seq_idx, target), targets.shape)
                if options.type == "interactive":
                    from Debug import debug_shell
                    debug_shell(locals())

            seq_len = dataset.get_seq_length(seq_idx)
            for key in dataset.get_data_keys():
                seq_len_stats[key].collect([seq_len[key]])
            if stats:
                if data is None:
                    # Stats were requested together with a tag/length-only dump type:
                    # load the data now instead of failing on an unbound name.
                    data = dataset.get_data(seq_idx, options.key)
                stats.collect(data)
            if options.type == "null":
                Util.progress_bar_with_time(complete_frac, prefix=progress_prefix)

            seq_idx += 1

        print("Done. Total time %s. More seqs which we did not dumped: %s" % (
            hms_fraction(time.time() - start_time), dataset.is_less_than_num_seqs(seq_idx)), file=log.v2)
        for key in dataset.get_data_keys():
            seq_len_stats[key].dump(stream_prefix="Seq-length %r " % key, stream=log.v2)
        if stats:
            stats.dump(output_file_prefix=options.dump_stats, stream_prefix="Data %r " % options.key, stream=log.v1)
        if options.type == "dump_seq_len":
            dump_file.write("}\n")
        if dump_file:
            print("Dumped to file:", dump_file.name, file=log.v2)
    finally:
        if dump_file:
            dump_file.close()
def dump_dataset(dataset, options):
    """
    Walk through the dataset and emit every sequence according to ``options.type``.

    Handled types: ``numpy`` (text files via ``numpy.savetxt``), ``stdout``,
    ``print_shape``, ``plot`` and ``null``.
    With ``options.get_num_seqs`` set, only the number of sequences is reported.
    A negative ``options.endseq`` is overwritten in place with ``float("inf")``.

    :type dataset: Dataset.Dataset
    :param options: argparse.Namespace
    """
    print("Epoch: %i" % options.epoch, file=log.v3)
    dataset.init_seq_order(epoch=options.epoch)
    print("Dataset keys:", dataset.get_data_keys(), file=log.v3)
    print("Dataset target keys:", dataset.get_target_list(), file=log.v3)
    assert options.key in dataset.get_data_keys()

    if options.get_num_seqs:
        print("Get num seqs.")
        print("estimated_num_seqs: %r" % dataset.estimated_num_seqs)
        try:
            print("num_seqs: %r" % dataset.num_seqs)
        except Exception as exc:
            # Size unknown up front: count the seqs by iterating over them.
            print("num_seqs exception %r, which is valid, so we count." % exc)
            count = 0
            first_target = dataset.get_target_list()[0] if dataset.get_target_list() else None
            while dataset.is_less_than_num_seqs(count):
                dataset.load_seqs(count, count + 1)
                if count % 10000 == 0:
                    postfix = ""
                    if first_target:
                        targets = dataset.get_targets(first_target, count)
                        postfix = " (targets = %r...)" % (targets[:10], )
                    print("%i ...%s" % (count, postfix))
                count += 1
            print("accumulated num seqs: %i" % count)
        print("Done.")
        return

    dump_type = options.type
    # Reject unknown types first, then announce what is going to happen.
    if dump_type not in ("numpy", "stdout", "print_shape", "plot", "null"):
        raise Exception("unknown dump option type %r" % options.type)
    if dump_type == "null":
        print("No dump.")
    elif dump_type == "numpy":
        print("Dump files: %r*%r" % (options.dump_prefix, options.dump_postfix), file=log.v3)
    else:
        banner = {
            "stdout": "Dump to stdout",
            "print_shape": "Dump shape to stdout",
            "plot": "Plot.",
        }[dump_type]
        print(banner, file=log.v3)

    t0 = time.time()
    if options.stats or options.dump_stats:
        stats = Stats()
    else:
        stats = None
    seq_len_stats = {}
    for data_key in dataset.get_data_keys():
        seq_len_stats[data_key] = Stats()

    idx = options.startseq
    if options.endseq < 0:
        options.endseq = float("inf")
    while dataset.is_less_than_num_seqs(idx) and idx <= options.endseq:
        dataset.load_seqs(idx, idx + 1)
        frac = dataset.get_complete_frac(idx)
        elapsed = time.time() - t0
        try:
            total_s = str(dataset.num_seqs)
        except NotImplementedError:
            try:
                total_s = "~%i" % dataset.estimated_num_seqs
            except TypeError:  # a number is required, not NoneType
                total_s = "?"
        progress_prefix = "%i/%s" % (idx, total_s)
        progress = "%s (%.02f%%)" % (progress_prefix, frac * 100)
        if frac > 0:
            # Estimated total time minus the time already spent.
            remaining = elapsed / frac - elapsed
            progress += " (%s)" % hms(remaining)

        data = dataset.get_data(idx, options.key)
        if dump_type == "numpy":
            data_filename = "%s%i.data%s" % (options.dump_prefix, idx, options.dump_postfix)
            numpy.savetxt(data_filename, data)
        elif dump_type == "stdout":
            print("seq %s data:" % progress, pretty_print(data))
        elif dump_type == "print_shape":
            print("seq %s data shape:" % progress, data.shape)
        elif dump_type == "plot":
            plot(data)

        for target in dataset.get_target_list():
            targets = dataset.get_targets(target, idx)
            if dump_type == "numpy":
                targets_filename = "%s%i.targets.%s%s" % (
                    options.dump_prefix, idx, target, options.dump_postfix)
                numpy.savetxt(targets_filename, targets, fmt='%i')
            elif dump_type == "stdout":
                print("seq %i target %r:" % (idx, target), pretty_print(targets))
            elif dump_type == "print_shape":
                print("seq %i target %r shape:" % (idx, target), targets.shape)

        lengths = dataset.get_seq_length(idx)
        for data_key in dataset.get_data_keys():
            seq_len_stats[data_key].collect([lengths[data_key]])
        if stats:
            stats.collect(data)
        if dump_type == "null":
            Util.progress_bar_with_time(frac, prefix=progress_prefix)

        idx += 1

    total_time = hms(time.time() - t0)
    more_left = dataset.is_less_than_num_seqs(idx)
    print("Done. Total time %s. More seqs which we did not dumped: %s" % (total_time, more_left), file=log.v1)
    for data_key in dataset.get_data_keys():
        seq_len_stats[data_key].dump(stream_prefix="Seq-length %r " % data_key, stream=log.v2)
    if stats:
        stats.dump(output_file_prefix=options.dump_stats, stream_prefix="Data %r " % options.key, stream=log.v2)
def dump_dataset(dataset, options):
    """
    Iterate over the dataset and dump every sequence in the way selected by ``options.type``.

    Handled types: ``numpy`` (one text file per seq via ``numpy.savetxt``), ``stdout``,
    ``print_tag``, ``print_shape``, ``plot`` and ``null``.

    If ``options.get_num_seqs`` is set, only the number of sequences is reported and
    the function returns without dumping anything.
    A negative ``options.endseq`` is overwritten in place with ``float("inf")``.

    :type dataset: Dataset.Dataset
    :param options: argparse.Namespace
    :raises Exception: if ``options.type`` is not one of the handled dump types
    """
    print("Epoch: %i" % options.epoch, file=log.v3)
    dataset.init_seq_order(epoch=options.epoch)
    print("Dataset keys:", dataset.get_data_keys(), file=log.v3)
    print("Dataset target keys:", dataset.get_target_list(), file=log.v3)
    assert options.key in dataset.get_data_keys()

    if options.get_num_seqs:
        print("Get num seqs.")
        print("estimated_num_seqs: %r" % dataset.estimated_num_seqs)
        try:
            print("num_seqs: %r" % dataset.num_seqs)
        except Exception as exc:
            # The dataset cannot tell its size up front, so count by iterating.
            print("num_seqs exception %r, which is valid, so we count." % exc)
            seq_idx = 0
            if dataset.get_target_list():
                default_target = dataset.get_target_list()[0]
            else:
                default_target = None
            while dataset.is_less_than_num_seqs(seq_idx):
                dataset.load_seqs(seq_idx, seq_idx + 1)
                if seq_idx % 10000 == 0:
                    if default_target:
                        targets = dataset.get_targets(default_target, seq_idx)
                        postfix = " (targets = %r...)" % (targets[:10],)
                    else:
                        postfix = ""
                    print("%i ...%s" % (seq_idx, postfix))
                seq_idx += 1
            print("accumulated num seqs: %i" % seq_idx)
        print("Done.")
        return

    if options.type == "numpy":
        print("Dump files: %r*%r" % (options.dump_prefix, options.dump_postfix), file=log.v3)
    elif options.type == "stdout":
        print("Dump to stdout", file=log.v3)
        if options.stdout_limit is not None:
            Util.set_pretty_print_default_limit(options.stdout_limit)
            numpy.set_printoptions(
                threshold=sys.maxsize if options.stdout_limit == float("inf") else int(options.stdout_limit))
        if options.stdout_as_bytes:
            Util.set_pretty_print_as_bytes(options.stdout_as_bytes)
    elif options.type == "print_tag":
        print("Dump seq tag to stdout", file=log.v3)
    elif options.type == "print_shape":
        print("Dump shape to stdout", file=log.v3)
    elif options.type == "plot":
        print("Plot.", file=log.v3)
    elif options.type == "null":
        print("No dump.")
    else:
        raise Exception("unknown dump option type %r" % options.type)

    start_time = time.time()
    stats = Stats() if (options.stats or options.dump_stats) else None
    seq_len_stats = {key: Stats() for key in dataset.get_data_keys()}
    seq_idx = options.startseq
    if options.endseq < 0:
        options.endseq = float("inf")
    while dataset.is_less_than_num_seqs(seq_idx) and seq_idx <= options.endseq:
        dataset.load_seqs(seq_idx, seq_idx + 1)
        complete_frac = dataset.get_complete_frac(seq_idx)
        start_elapsed = time.time() - start_time
        try:
            num_seqs_s = str(dataset.num_seqs)
        except NotImplementedError:
            try:
                num_seqs_s = "~%i" % dataset.estimated_num_seqs
            except TypeError:  # a number is required, not NoneType
                num_seqs_s = "?"
        progress_prefix = "%i/%s" % (seq_idx, num_seqs_s)
        progress = "%s (%.02f%%)" % (progress_prefix, complete_frac * 100)
        if complete_frac > 0:
            # Linear extrapolation of the remaining time from the elapsed time.
            total_time_estimated = start_elapsed / complete_frac
            remaining_estimated = total_time_estimated - start_elapsed
            progress += " (%s)" % hms(remaining_estimated)

        # The tag-only dump type below does not need the data itself.
        data = None
        if options.type == "print_tag":
            print("seq %s tag:" % (progress if log.verbose[2] else progress_prefix), dataset.get_tag(seq_idx))
        else:
            data = dataset.get_data(seq_idx, options.key)
            if options.type == "numpy":
                numpy.savetxt("%s%i.data%s" % (options.dump_prefix, seq_idx, options.dump_postfix), data)
            elif options.type == "stdout":
                print("seq %s tag:" % progress, dataset.get_tag(seq_idx))
                print("seq %s data:" % progress, pretty_print(data))
            elif options.type == "print_shape":
                print("seq %s data shape:" % progress, data.shape)
            elif options.type == "plot":
                plot(data)
            for target in dataset.get_target_list():
                targets = dataset.get_targets(target, seq_idx)
                if options.type == "numpy":
                    numpy.savetxt(
                        "%s%i.targets.%s%s" % (options.dump_prefix, seq_idx, target, options.dump_postfix),
                        targets, fmt='%i')
                elif options.type == "stdout":
                    extra = ""
                    if target in dataset.labels and len(dataset.labels[target]) > 1:
                        labels = dataset.labels[target]
                        # Small vocabularies of single-character labels are joined without
                        # a separator; everything else is joined with spaces.
                        if len(labels) < 1000 and all(len(label) == 1 for label in labels):
                            join_str = ""
                        else:
                            join_str = " "
                        extra += " (%r)" % join_str.join(map(labels.__getitem__, targets))
                    print("seq %i target %r: %s%s" % (seq_idx, target, pretty_print(targets), extra))
                elif options.type == "print_shape":
                    print("seq %i target %r shape:" % (seq_idx, target), targets.shape)

        seq_len = dataset.get_seq_length(seq_idx)
        for key in dataset.get_data_keys():
            seq_len_stats[key].collect([seq_len[key]])
        if stats:
            if data is None:
                # Stats were requested together with the tag-only dump type:
                # load the data now instead of failing on an unbound name.
                data = dataset.get_data(seq_idx, options.key)
            stats.collect(data)
        if options.type == "null":
            Util.progress_bar_with_time(complete_frac, prefix=progress_prefix)

        seq_idx += 1

    print("Done. Total time %s. More seqs which we did not dumped: %s" % (
        hms(time.time() - start_time), dataset.is_less_than_num_seqs(seq_idx)), file=log.v2)
    for key in dataset.get_data_keys():
        seq_len_stats[key].dump(stream_prefix="Seq-length %r " % key, stream=log.v2)
    if stats:
        stats.dump(output_file_prefix=options.dump_stats, stream_prefix="Data %r " % options.key, stream=log.v1)