def udpipe(sentences, model_name, verbose=False):
    """
    Parse text to Universal Dependencies using UDPipe.
    :param sentences: iterable of iterables of strings (one string per line)
    :param model_name: filename containing UDPipe model to load
    :param verbose: print extra information
    :return: iterable of lines containing parsed output
    """
    # Imported lazily so ufal.udpipe is only required when this function is used.
    from ufal.udpipe import Model, Pipeline, ProcessingError
    model = Model.load(model_name)
    if not model:
        raise ValueError("Invalid model: '%s'" % model_name)
    # Input and output are both CoNLL-U: UDPipe only fills in annotation layers.
    pipeline = Pipeline(model, "conllu", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
    # tee() the flattened lines so we can both join them into one text and count them.
    lines1, lines2 = tee(l for s in sentences for l in s)
    text = "\n".join(lines1)
    error = ProcessingError()
    num_tokens = sum(1 for l in lines2 if l)  # count non-empty lines only
    with tqdm.external_write_mode():  # print without garbling any active progress bar
        print("Running %s on %d tokens... " % (model_name, num_tokens), end="", flush=True)
    start = time()
    processed = pipeline.process(text, error)
    duration = time() - start
    with tqdm.external_write_mode():
        print("Done (%.3fs, %.0f tokens/s)" % (duration, num_tokens / duration if duration else 0))
    if verbose:
        print(processed)
    if error.occurred():
        raise RuntimeError(error.message)
    return processed.splitlines()
def evaluate_all(args, evaluate, files, name=None):
    """
    Evaluate guessed passages against reference passages, yielding one result per pair.

    :param args: namespace; reads format, verbose, quiet, units, errors, unlabeled
    :param evaluate: function taking (guessed, ref, ...) and returning a scores object
    :param files: pair of file lists: (guessed files, reference files)
    :param name: optional description for the progress bar
    :return: generator of evaluation results
    """
    # Each read_files() item is a 6-tuple; note the converter positions differ:
    # the guessed side uses its 5th element (in-converter), the reference side its 6th
    # (out-converter).
    for ((guessed_converted, guessed_passage, _, guessed_format, guessed_converter, _),
         (ref_converted, ref_passage, passage_id, ref_format, _, ref_converter)) in \
            tqdm(zip(*[read_files(f, args.format, verbose=args.verbose) for f in files]),
                 unit=" passages", desc=name, total=len(files[-1])):
        if not args.quiet:
            with tqdm.external_write_mode():  # keep output clear of the progress bar
                print(passage_id, end=" ")
        if guessed_format != ref_format:
            # Bring the guessed passage into the reference's format before comparing.
            guessed_passage = next(iter(guessed_converter(guessed_passage + [""], passage_id=passage_id))) if \
                ref_converter is None else ref_converter(guessed_converted)
        result = evaluate(guessed_passage, ref_passage, verbose=args.verbose > 1 or args.units,
                          units=args.units, errors=args.errors)
        if not args.quiet:
            with tqdm.external_write_mode():
                print("F1: %.3f" % result.average_f1(UNLABELED if args.unlabeled else LABELED))
        if args.verbose:
            with tqdm.external_write_mode():
                result.print()
        yield result
def copy_splits(src_supervoxels, src_info, dest_info):
    """
    Copy the given source supervoxels to the destination instance, splitting
    destination supervoxels as needed, processing smallest source supervoxels first.

    :param src_supervoxels: iterable of source supervoxel IDs to copy
    :param src_info: source server/instance info, as accepted by the fetch_* helpers
    :param dest_info: destination server/instance info
    :return: list of SplitCopyInfo records, one per source supervoxel
    """
    logger.info("Reading source supervoxel sizes...")
    src_sv_sizes = []
    for src_sv in tqdm(src_supervoxels):
        src_sv_sizes.append(fetch_body_size(src_info, src_sv, supervoxels=True))
    sv_size_df = pd.DataFrame({'src_sv': src_supervoxels, 'src_sv_size': src_sv_sizes})
    sv_size_df.sort_values(['src_sv_size'], inplace=True)

    logger.info("Copying supervoxels from smallest -> largest")
    copy_infos = []
    for src_sv, src_sv_size in tqdm(sv_size_df.itertuples(index=False), total=len(sv_size_df)):
        try:
            # Sample a coordinate inside the source SV to find which destination SV it lands in.
            first_coord_zyx = generate_sample_coordinate(src_info, src_sv, supervoxels=True)
            dest_sv = fetch_label_for_coordinate(dest_info, first_coord_zyx, supervoxels=True)
            dest_sv_size = fetch_body_size(dest_info, dest_sv, supervoxels=True)
            if src_sv_size == dest_sv_size:
                # Same size at the same location: treat as already copied and skip.
                with tqdm.external_write_mode():
                    logger.info(f"SV {src_sv} appears to be already copied at the destination, where it has ID {dest_sv}. Skipping.")
                split_info = SplitCopyInfo(src_sv, dest_sv, dest_sv, 0)
            elif src_sv_size > dest_sv_size:
                # The source SV cannot fit inside the destination SV; refuse rather than corrupt.
                with tqdm.external_write_mode():
                    logger.error(f"Refusing to copy SV {src_sv}: It is too big for the destination supervoxel (SV {dest_sv})!")
                split_info = SplitCopyInfo(src_sv, dest_sv, 0, 0)
            else:
                # Fetch RLEs and apply to destination
                rle_payload = fetch_sparsevol_rles(src_info, src_sv, supervoxels=True)
                rle_size, first_coord_zyx = extract_rle_size_and_first_coord(rle_payload)
                assert rle_size == src_sv_size
                split_sv, remain_sv = split_supervoxel(dest_info, dest_sv, rle_payload)
                split_info = SplitCopyInfo(src_sv, dest_sv, split_sv, remain_sv)
        except Exception as ex:
            # Best-effort: record the failure with zeroed IDs and keep copying the rest.
            with tqdm.external_write_mode():
                logger.error(f"Error copying SV {src_sv}: {ex}")
            split_info = SplitCopyInfo(src_sv, 0, 0, 0)
        copy_infos.append(split_info)
    return copy_infos
def evaluate_all(evaluate, files, name=None, verbose=0, quiet=False, basename=False, matching_ids=False,
                 units=False, errors=False, unlabeled=False, **kwargs):
    """
    Evaluate guessed passages against reference passages, yielding one result per pair.

    :param evaluate: function taking (guessed, ref, ...) and returning a scores object
    :param files: pair of file lists: (guessed files, reference files)
    :param name: optional description for the progress bar
    :param verbose: verbosity level (>1 also makes evaluate verbose)
    :param quiet: suppress per-passage ID and F1 printing
    :param basename: force passage IDs to the file basename when reading
    :param matching_ids: advance either stream so only passages with equal IDs are paired
    :param units: evaluate units (also passed through to evaluate)
    :param errors: passed through to evaluate
    :param unlabeled: evaluate/report unlabeled instead of labeled scores
    :param kwargs: must contain "format", the default format passed to read_files
    :return: generator of evaluation results
    """
    guessed, ref = [iter(read_files(f, kwargs["format"], verbose=verbose, force_basename=basename))
                    for f in files]
    for (g, r) in tqdm(zip(guessed, ref), unit=" passages", desc=name, total=len(files[-1])):
        if matching_ids:
            # Skip unmatched passages until the IDs line up.
            # NOTE(review): assumes both streams are sorted by ID -- confirm at call sites.
            while g.ID < r.ID:
                g = next(guessed)
            while g.ID > r.ID:
                r = next(ref)
        if not quiet:
            with tqdm.external_write_mode():  # keep output clear of the progress bar
                print(r.ID, end=" ")
        if g.format != r.format:
            # Bring the guessed passage into the reference's format before comparing.
            # noinspection PyCallingNonCallable
            g.passage = next(iter(g.in_converter(g.passage + [""], passage_id=r.ID))) if \
                r.out_converter is None else r.out_converter(g.converted)
        result = evaluate(g.passage, r.passage, verbose=verbose > 1 or units, units=units, errors=errors,
                          eval_type=UNLABELED if unlabeled else None)
        if not quiet:
            with tqdm.external_write_mode():
                print("F1: %.3f" % result.average_f1(UNLABELED if unlabeled else LABELED))
        if verbose:
            with tqdm.external_write_mode():
                result.print()
        yield result
def copy_meshes(info_from, info_to, tarball_type, body_ids, parallelism=1, error_mode='fail'):
    """
    Copy mesh tarballs for the given bodies from one instance to another, in parallel.

    :param info_from: source server/instance info, passed to copy_tarballs_for_body
    :param info_to: destination server/instance info
    :param tarball_type: tarball type identifier, passed to copy_tarballs_for_body
    :param body_ids: iterable of body IDs whose meshes should be copied
    :param parallelism: number of worker processes
    :param error_mode: 'fail' re-raises the first request error; anything else
        logs the error and continues with the remaining bodies
    :raises requests.RequestException: when a copy fails and error_mode == 'fail'
    """
    # Fix: the pool was previously never closed/joined (worker-process leak).
    # The context manager guarantees workers are torn down on exit or error.
    with multiprocessing.Pool(parallelism) as pool:
        # Building the list immediately submits every task to the pool.
        ids_and_tasks = [
            (body_id,
             pool.apply_async(copy_tarballs_for_body, (info_from, info_to, tarball_type, body_id)))
            for body_id in body_ids
        ]

        # Iterate over the 'results' in the queue.
        # If any failed in a worker process, the exception will be re-raised here upon calling get().
        for i, (body_id, task) in enumerate(tqdm(ids_and_tasks)):
            try:
                task.get()  # Ensure copy is complete; catch any pickled exception now
            except requests.RequestException:
                with tqdm.external_write_mode():
                    logger.error(f"Error copying body {body_id} (mesh #{i} in the list)")
                if error_mode == 'fail':
                    # Note: Since we're using a pool, it's possible that some meshes
                    # after this one have already successfully copied,
                    # but at least all meshes before this one have definitely succeeded.
                    raise
def copy_annotation(passages, conllu, as_array=True, verbose=False):
    """Overwrite each passage's token docs with annotation read from CoNLL-U input.

    :param passages: iterable of target passages, paired positionally with the CoNLL-U data
    :param conllu: files/directories containing the annotated CoNLL-U data
    :param as_array: must be True; non-array annotation copying is unsupported
    :param verbose: print a message for each annotation source read
    :return: generator of the updated passages
    """
    if not as_array:
        raise ValueError("Annotating with CoNLL-U files and as_array=False are currently not supported; use --as-array")
    annotation_stream = read_files_and_dirs(conllu, converters=CONVERTERS)
    for target, source in zip(passages, annotation_stream):
        if verbose:
            with tqdm.external_write_mode():
                print("Reading annotation from '%s'" % source.ID)
        source_docs = source.layer(layer0.LAYER_ID).docs()
        target.layer(layer0.LAYER_ID).docs()[:] = source_docs
        yield target
def _echo_with_tqdm_lock(message: str) -> None:
    """Echo *message* via click without corrupting any active tqdm progress bar.

    Printing while a progress bar is live can leave a stale (not removed) bar
    line on screen; tqdm's ``external_write_mode`` suspends bar output for the
    duration of the write.
    """
    color = cfg.get("color")
    with tqdm.external_write_mode():
        click.echo(message=message, color=color)
def main(args):
    """
    Print a CSV row of corpus statistics per passage, plus an optional summary row.

    :param args: namespace with filenames, summary (print only the aggregate row),
        and outfile (optional path for a tab-separated dump sorted by passage ID)
    """
    print("id,passages,paragraphs,sentences,nodes,terminals,non-terminals,implicit,linkage,discont,"
          "edges,primary,remote,linkage,parents,children,mult-parents")
    data = []
    for passage in get_passages_with_progress_bar(args.filenames):
        terminals = passage.layer(layer0.LAYER_ID).all
        # "1.1" is excluded everywhere below -- presumably the root node; confirm.
        non_terminals = [n for n in passage.layer(layer1.LAYER_ID).all if n.ID != "1.1"]
        non_linkage = [n for n in non_terminals if n.tag != NodeTags.Linkage]
        linkage_nodes = passage.layer(layer1.LAYER_ID).top_linkages
        edges = {e for n in non_terminals for e in n}
        remote = [e for e in edges if e.attrib.get("remote")]
        linkage_edges = [e for n in linkage_nodes for e in n]
        # One value per CSV header column, in the same order.
        fields = (
            int(passage.ID),
            1,  # one passage per row
            len({t.paragraph for t in terminals}),
            len(break2sentences(passage)),
            len(terminals) + len(non_terminals),
            len(terminals),
            len(non_terminals),
            len([n for n in non_linkage if n.attrib.get("implicit")]),
            len(linkage_nodes),
            len([n for n in non_linkage if n.tag == NodeTags.Foundational and n.discontiguous]),
            len(edges),
            len(edges) - len(remote) - len(linkage_edges),  # primary = all minus remote/linkage
            len(remote),
            len(linkage_edges),
            sum(len([p for p in n.parents if p.ID != "1.1"]) for n in non_linkage),
            sum(len(n.children) for n in non_linkage),
            len([n for n in non_linkage if len([p for p in n.parents if p.ID != "1.1"]) > 1]),
        )
        if not args.summary:
            with tqdm.external_write_mode():  # keep CSV rows clear of the progress bar
                print(",".join("%d" % f for f in fields))
        data.append(fields)
    data = np.array(data, dtype=int)
    if args.outfile:
        np.savetxt(args.outfile, data[data[:, 0].argsort()], fmt="%i", delimiter="\t")
    if args.summary:
        print(",".join("%d" % f for f in data.sum(axis=0)))
def write_passage(passage, output_format=None, binary=False, outdir=".", prefix="", converter=None, verbose=True):
    """
    Write a single passage to a file in the requested format.

    :param passage: passage to write; its ID determines the file name
    :param output_format: format name, or None/"ucca" for native output
    :param binary: for native output, write pickle instead of XML
    :param outdir: output directory
    :param prefix: string prepended to the output file name
    :param converter: function yielding output lines for non-native formats
        (defaults to to_text)
    :param verbose: print the name of each file written
    :return: path of the written file
    """
    suffix = output_format if output_format and output_format != "ucca" else ("pickle" if binary else "xml")
    # Fix: use os.path.join instead of manual separator concatenation, which
    # produced doubled separators when outdir already ended with one.
    outfile = os.path.join(outdir, prefix + passage.ID + "." + suffix)
    if verbose:
        with tqdm.external_write_mode():  # keep output clear of any progress bar
            print("Writing passage '%s'..." % outfile)
    if output_format is None or output_format in ("ucca", "pickle", "xml"):
        passage2file(passage, outfile, binary=binary)
    else:
        output = "\n".join(line for line in (converter or to_text)(passage))
        with open(outfile, "w", encoding="utf-8") as f:
            f.write(output + "\n")
    return outfile
def to_file(packets, f, format):
    """Write packets to f as format.

    :param packets: iterable of packet objects exposing one attribute per format name
    :param f: writable file object; when it is a TTY, each write is wrapped in
        tqdm.external_write_mode() so progress bars are not garbled
    :param format: attribute name to write, or 'auto' ('debug' on a TTY, else 'bytes')
    :return: generator yielding each packet after it is written
    """
    if format == 'auto':
        format = 'debug' if f.isatty() else 'bytes'
    interactive = f.isatty()
    for packet in packets:
        payload = getattr(packet, format)
        if interactive:
            with tqdm.external_write_mode():
                f.write(payload)
        else:
            f.write(payload)
        yield packet
def register_artifact(self, binary, artifact: IArtifact):
    """
    Upload *binary* for *artifact* via the API, logging the outcome.

    Skips silently (with an info log) when the artifact is flagged as already
    registered; translates an "already exists" ApiError into a clearer message.

    :param binary: artifact payload to upload
    :param artifact: artifact metadata (type, name, version)
    :raises ApiError: when the artifact version already exists, or on any other API failure
    """
    if getattr(artifact, 'already_registered', None):
        with tqdm.external_write_mode(nolock=True):
            self.config.logger.info("{} '{}' already registered, ignoring.".format(
                artifact.get_pretty_type(), artifact.get_name()))
        return
    try:
        self.config.api.upload_artifact(binary, artifact)
    except ApiError as e:
        if e.message and 'already exists' in e.message:
            # Fix: chain the original error so the underlying upload failure is kept.
            raise ApiError(
                "{} '{}' at version {} has already been registered and cannot be "
                "overwritten.".format(artifact.get_pretty_type(), artifact.get_name(),
                                      artifact.get_version())) from e
        else:
            raise  # Fix: bare raise preserves the original traceback (was `raise e`)
    with tqdm.external_write_mode(nolock=True):
        self.config.logger.info("{} '{}' registered.".format(
            artifact.get_pretty_type(), artifact.get_name()))
def read_files(files, default_format=None, verbose=0, force_basename=False):
    """
    Read passage files in numeric filename order, yielding ConvertedPassage objects.

    :param files: iterable of file names to read
    :param default_format: format to assume when the filename does not determine one
    :param verbose: print a message for each file being converted
    :param force_basename: use the file basename as the passage ID
    :return: generator of ConvertedPassage
    """
    # Sort numerically by the digit groups in each name, falling back to the name itself.
    # Fix: the regex is now a raw string (r"\d+"); "\d" is an invalid escape sequence.
    for filename in sorted(files, key=lambda x: tuple(map(int, re.findall(r"\d+", x))) or x):
        basename, converted_format = passage_format(filename)
        in_converter, out_converter = CONVERTERS.get(converted_format, CONVERTERS[default_format])
        kwargs = dict(converted_format=converted_format, in_converter=in_converter,
                      out_converter=out_converter)
        if in_converter:
            with open(filename, encoding="utf-8") as f:
                # A single file may yield several converted passages.
                for converted, passage, passage_id in in_converter(f, passage_id=basename,
                                                                   return_original=True):
                    if verbose:
                        with tqdm.external_write_mode():
                            print("Converting %s from %s" % (filename, converted_format))
                    yield ConvertedPassage(converted, passage,
                                           basename if force_basename else passage_id, **kwargs)
        else:
            # No converter: read the file directly as a passage.
            passage_id = basename if force_basename else None
            yield ConvertedPassage(ioutil.file2passage(filename), passage_id=passage_id, **kwargs)
def main(args):
    """
    Split passages into sentences and write each sentence to its own file.

    :param args: namespace with filenames, outdir, prefix, binary, remarks, lang,
        and optionally sentences: a file of sentence texts (one per line) whose
        order drives the split
    """
    order = None
    if args.sentences:
        # Map each stripped sentence line to its position in the file.
        with open(args.sentences, encoding="utf-8") as f:
            order = dict(map(reversed, enumerate(map(str.strip, f))))
    for passage in get_passages_with_progress_bar(args.filenames, "Splitting"):
        for sentence in split(passage, order) if order else split2sentences(
                passage, remarks=args.remarks, lang=args.lang):
            outfile = os.path.join(
                args.outdir,
                args.prefix + sentence.ID + (".pickle" if args.binary else ".xml"))
            with tqdm.external_write_mode():  # keep output clear of the progress bar
                print("Writing passage file for sentence '%s'..." % outfile, file=sys.stderr)
            passage2file(sentence, outfile, args.binary)
def main(args):
    """
    Print occurrences of the requested grammatical constructions in each passage.

    :param args: namespace with passages (file names), constructions (which to
        extract), and verbose
    """
    for passage in get_passages_with_progress_bar(args.passages):
        extracted = constructions.extract_edges(
            passage, constructions=args.constructions, verbose=args.verbose)
        if any(extracted.values()):
            # Print the whole report atomically so the progress bar is not garbled.
            with tqdm.external_write_mode():
                if not args.verbose:
                    print("%s:" % passage.ID)
                for construction, edges in extracted.items():
                    if edges:
                        print("  %s:" % construction.description)
                        for edge in edges:
                            print("    %s [%s %s]" % (edge, edge.tag, edge.child))
                print()
def apply_split(rle_payload_bytes, server, uuid, instance, verbose=False):
    """
    Apply a split, given as a binary RLE payload, to the supervoxel containing
    the payload's first coordinate.

    :param rle_payload_bytes: RLE payload; parsed here as uint32 words with a
        3-word header followed by rows of (x, y, z, run_length) -- TODO confirm
        against the sparsevol RLE spec
    :param server: server address
    :param uuid: node UUID
    :param instance: instance name
    :param verbose: log which supervoxel is being split and its voxel count
    """
    # Skip the 3-word header, then view the payload as rows of (x, y, z, length).
    rles = np.frombuffer(rle_payload_bytes, dtype=np.uint32)[3:]
    rles = rles.reshape(-1, 4)
    first_coord_xyz = rles[0, :3]
    # Reversed to ZYX -- presumably the order fetch_label_for_coordinate expects; confirm.
    first_coord_zyx = first_coord_xyz[::-1]
    supervoxel = fetch_label_for_coordinate(server, uuid, instance, first_coord_zyx, supervoxels=True)
    voxel_count = rles[:, 3].sum()  # total voxels = sum of run lengths
    if verbose:
        with tqdm.external_write_mode():  # keep output clear of the progress bar
            logger.info(f"Applying split to {supervoxel} ({voxel_count} voxels)")
    split_supervoxel(server, uuid, instance, supervoxel, rle_payload_bytes)
def write_passage(passage, args):
    """
    Write a passage to a file in the format selected by args.output_format.

    :param passage: passage to write; its ID determines the file name
    :param args: namespace with out_dir, output_format, binary, verbose, and the
        converter options split, test, tree, mark_aux
    """
    # Extension: native UCCA uses UCCA_EXT[binary], AMR uses .txt,
    # any other format uses its own name as the extension.
    ext = {None: UCCA_EXT[args.binary], "amr": ".txt"}.get(args.output_format) or "." + args.output_format
    outfile = args.out_dir + os.path.sep + passage.ID + ext
    if args.verbose:
        with tqdm.external_write_mode():  # keep output clear of the progress bar
            print("Writing '%s'..." % outfile, file=sys.stderr)
    if args.output_format is None:  # UCCA output
        ioutil.passage2file(passage, outfile, binary=args.binary)
    else:
        converter = CONVERTERS[args.output_format][1]
        # AMR converts the passage whole; other formats may first split into sentences.
        output = "\n".join(converter(passage)) if args.output_format == "amr" else \
            "\n".join(line for p in (split2sentences(passage) if args.split else [passage])
                      for line in converter(p, test=args.test, tree=args.tree, mark_aux=args.mark_aux))
        with open(outfile, "w", encoding="utf-8") as f:
            print(output, file=f)
def _next_passage(self):
    """
    Return the next passage from the underlying file iterator, transparently
    converting non-passage files and optionally splitting passages into
    sentence/paragraph segments.  Recurses via next(self) when a file is
    missing or a converter stream is exhausted.

    :return: the next Passage
    :raises StopIteration: when the file iterator is exhausted
    """
    passage = None
    if self._split_iter is None:  # not currently draining a converter stream
        try:
            file = next(self._files_iter)
        except StopIteration:  # Finished iteration
            raise
        if isinstance(file, Passage):  # Not really a file, but a Passage
            passage = file
        else:  # A file
            # Wait for the file to appear (retrying a few times), then give up
            # and move on to the next file.
            attempts = 3
            while not os.path.exists(file):
                with tqdm.external_write_mode(file=sys.stderr):
                    if attempts == 0:
                        print("File not found: %s" % file, file=sys.stderr)
                        return next(self)
                    print("Failed reading %s, trying %d more times..." % (file, attempts),
                          file=sys.stderr)
                time.sleep(5)
                attempts -= 1
            try:
                passage = file2passage(file)  # XML or binary format
            except (IOError, ParseError):  # Failed to read as passage file
                # Fall back to a converter chosen by file extension; one file
                # may yield several passages, streamed via _split_iter.
                base, ext = os.path.splitext(os.path.basename(file))
                converter = self.converters[ext.lstrip(".")]
                self._file_handle = open(file, encoding="utf-8")
                self._split_iter = iter(converter(self._file_handle, passage_id=base, lang=self.lang))
        if self.split:
            if self._split_iter is None:
                self._split_iter = (passage,)
            # Re-wrap the pending passages as sentence/paragraph segments.
            self._split_iter = iter(s for p in self._split_iter
                                    for s in split2segments(p, is_sentences=self.sentences,
                                                            lang=self.lang))
    if self._split_iter is not None:  # Either set before or initialized now
        try:
            # noinspection PyTypeChecker
            passage = next(self._split_iter)
        except StopIteration:  # Finished this converter
            self._split_iter = None
            if self._file_handle is not None:
                self._file_handle.close()
                self._file_handle = None
            return next(self)
    return passage
def main(args):
    """
    Round-trip every passage through the given format and evaluate the result
    against the original, printing aggregate scores at the end.

    :param args: namespace with filenames, format, tree, strict, and verbose
    """
    converter1 = convert.TO_FORMAT[args.format]
    converter2 = convert.FROM_FORMAT[args.format]
    scores = []
    for ref in get_passages_with_progress_bar(args.filenames, desc="Converting"):
        try:
            # Convert to the target format and straight back, then score the round trip.
            guessed = next(converter2(converter1(ref, tree=args.tree), ref.ID))
            scores.append(evaluate(guessed, ref, verbose=args.verbose))
        except Exception as e:
            if args.strict:
                raise ValueError("Error evaluating conversion of %s" % ref.ID) from e
            else:
                # Best-effort mode: report the failure and continue with the next passage.
                with tqdm.external_write_mode():
                    print("Error evaluating conversion of %s: %s" % (ref.ID, e), file=sys.stderr)
    print()
    if args.verbose and len(scores) > 1:
        print("Aggregated scores:")
    Scores.aggregate(scores).print()
def apply_splits_from_saved_rles(kafka_log, server, uuid, labelmap_instance, verbose=False):
    """
    Replay supervoxel splits from RLE files saved on disk, one '<id>.rle' per new ID.

    :param kafka_log: kafka log entries from which the new supervoxel IDs are parsed
    :param server: server address
    :param uuid: node UUID
    :param labelmap_instance: name of the labelmap instance to modify
    :param verbose: log each RLE file as it is loaded
    :raises RuntimeError: if an expected .rle file is missing
    """
    new_ids = parse_new_ids(kafka_log)
    with Timer(f"Applying {len(new_ids)} split RLEs"):
        for new_id in tqdm(new_ids):
            # RLE files are expected in the current working directory, named '<id>.rle'.
            rle_path = f'{new_id}.rle'
            if not os.path.exists(rle_path):
                raise RuntimeError(f"Can't find .rle file: {rle_path}")
            if verbose:
                with tqdm.external_write_mode():  # keep output clear of the progress bar
                    logger.info(f"Loading {rle_path}")
            with open(rle_path, 'rb') as f:
                rle_payload = f.read()
            if len(rle_payload) == 0:
                # Empty file: log and skip rather than attempt an empty split.
                logger.error(f"Error: {rle_path} has no content!")
                continue
            apply_split(rle_payload, server, uuid, labelmap_instance, verbose)
def get_nlp(lang="en"):
    """
    Get a cached spaCy pipeline for *lang*, loading (and downloading if necessary)
    on first use.  The pipeline's tokenizer is replaced with one that accepts
    pre-tokenized word lists; the original tokenizer is stashed in tokenizer[lang].

    :param lang: language code used to pick the model and cache slot
    :return: the spaCy Language instance for lang
    :raises OSError: if the model cannot be loaded even after downloading
    """
    instance = nlp.get(lang)
    if instance is None:
        import spacy
        model = models.get(lang)
        if not model:
            # Resolution order: per-language env var, generic env var, built-in default.
            models[lang] = model = os.environ.get("_".join((MODEL_ENV_VAR, lang.upper()))) or \
                                   os.environ.get(MODEL_ENV_VAR) or DEFAULT_MODEL.get(lang, "xx")
        started = time.time()
        with tqdm.external_write_mode():
            print("Loading spaCy model '%s'... " % model, end="", flush=True)
        try:
            nlp[lang] = instance = spacy.load(model)
        except OSError:
            # Model not installed: download it and retry once.
            spacy.cli.download(model)
            try:
                nlp[lang] = instance = spacy.load(model)
            except OSError as e:
                raise OSError("Failed to get spaCy model. Download it manually using "
                              "`python -m spacy download %s`." % model) from e
        # Keep the original tokenizer, and make the pipeline accept pre-tokenized words.
        tokenizer[lang] = instance.tokenizer
        instance.tokenizer = lambda words: spacy.tokens.Doc(instance.vocab, words=words)
        # Fix: wrap like the matching "Loading" print, so an active progress bar
        # is not garbled (this print was previously bare).
        with tqdm.external_write_mode():
            print("Done (%.3fs)." % (time.time() - started))
    return instance
def external_write_mode(*args, **kwargs):
    """
    Generator wrapping tqdm.external_write_mode, degrading gracefully when the
    installed tqdm does not provide it.

    NOTE(review): presumably used with @contextlib.contextmanager (the decorator
    is not visible in this chunk) -- confirm at the definition site.
    NOTE(review): the except also swallows AttributeError raised from the wrapped
    body, not just from tqdm -- confirm this is intended.
    """
    try:
        with tqdm.external_write_mode(*args, **kwargs):
            yield
    except AttributeError:  # old tqdm without external_write_mode
        yield
# --- Top-level script: argument parsing and visualization driver ---
group = argparser.add_mutually_exclusive_group()
group.add_argument("--no-normalize", action="store_false", dest="normalize", help="do not normalize passage")
group.add_argument("-e", "--extra-normalization", action="store_true", help="more normalization rules")
argparser.add_argument("--label-map", help="CSV file specifying mapping of input edge labels to output edge labels")
# NOTE(review): this help text looks copied from a --tikz option; -i toggles node IDs -- confirm.
argparser.add_argument("-i", "--node-ids", action="store_true", help="print tikz code rather than showing plots")
args = argparser.parse_args()
if args.out_dir:
    os.makedirs(args.out_dir, exist_ok=True)
for passage in get_passages_with_progress_bar(args.passages, desc="Visualizing", converters=FROM_FORMAT):
    map_labels(passage, args.label_map)
    if args.normalize:
        normalize(passage, extra=args.extra_normalization)
    if args.tikz:
        # TikZ mode: emit LaTeX code either to a file or to stdout.
        tikz = visualization.tikz(passage, node_ids=args.node_ids)
        if args.out_dir:
            with open(os.path.join(args.out_dir, passage.ID + ".tikz.txt"), "w") as f:
                print(tikz, file=f)
        else:
            with tqdm.external_write_mode():  # keep output clear of the progress bar
                print(tikz)
    else:
        # Plot mode: render with matplotlib, saving to PNG or showing fullscreen.
        plt.figure(figsize=(19, 10))
        visualization.draw(passage, node_ids=args.node_ids)
        if args.out_dir:
            plt.savefig(os.path.join(args.out_dir, passage.ID + ".png"))
        else:
            mng = plt.get_current_fig_manager()
            mng.full_screen_toggle()
            plt.show()
def write(self, message: str) -> None:
    """Write message to stdout.

    Wrapped in tqdm's external_write_mode so an active progress bar is not
    garbled by the echo.
    """
    with tqdm.external_write_mode():
        color_setting = self.config.get("color")
        click.echo(message=message, color=color_setting)
def tqdm_external_write_mode(file=None, nolock=False):
    """Context manager for printing safely while a tqdm progress bar is active.

    (Docstring translated from Japanese: "helper for when you want to print
    something during tqdm.")

    :param file: stream the bar writes to (passed through to tqdm)
    :param nolock: skip acquiring tqdm's lock (passed through to tqdm)
    :return: the tqdm external_write_mode context manager
    """
    from tqdm import tqdm as _tqdm
    return _tqdm.external_write_mode(file=file, nolock=nolock)
def _train_multi(
    self,
    train_data: TimeSeriesDataFrame,
    hyperparameters: Optional[Union[str, Dict]] = None,
    models: Optional[List[AbstractTimeSeriesModel]] = None,
    val_data: Optional[TimeSeriesDataFrame] = None,
    hyperparameter_tune: bool = False,
    time_limit: Optional[float] = None,
) -> List[str]:
    """Train (or tune) each model in sequence, then optionally fit an ensemble.

    :param train_data: training time series data
    :param hyperparameters: config used to construct model templates when *models*
        is not given (deep-copied so the caller's object is not mutated)
    :param models: pre-constructed models to train; required if hyperparameters is None
    :param val_data: optional validation data (also saved when self.save_data is set)
    :param hyperparameter_tune: tune each model instead of a plain fit
    :param time_limit: overall time budget in seconds, divided evenly across models
    :return: names of all successfully trained models (plus the ensemble, if fit)
    :raises ValueError: if neither models nor hyperparameters is provided
    """
    logger.info(f"\nStarting training. Start time is {time.strftime('%Y-%m-%d %H:%M:%S')}")
    time_start = time.time()
    if hyperparameters is not None:
        hyperparameters = copy.deepcopy(hyperparameters)
    else:
        if models is None:
            raise ValueError("Either models or hyperparameters should be provided")

    if self.save_data and not self.is_data_saved:
        self.save_train_data(train_data)
        if val_data is not None:
            self.save_val_data(val_data)
        self.is_data_saved = True

    if models is None:
        models = self.construct_model_templates(
            hyperparameters=hyperparameters,
            hyperparameter_tune=hyperparameter_tune,
            freq=train_data.freq,
        )

    logger.info(f"Models that will be trained: {list(m.name for m in models)}")

    # Split the overall budget evenly across the models to be trained.
    time_limit_model_split = time_limit
    if time_limit is not None and len(models) > 0:
        time_limit_model_split /= len(models)

    model_names_trained = []
    for i, model in enumerate(models):
        if hyperparameter_tune:
            time_left = time_limit_model_split
            fit_log_message = f"Hyperparameter tuning model: {model.name}. "
            if time_limit is not None and time_limit_model_split is not None:
                fit_log_message += (
                    f"Tuning model for up to {time_limit_model_split:.2f}s "
                    f"of the {time_limit:.2f}s remaining.")
            logger.info(fit_log_message)
            with tqdm.external_write_mode():  # keep tuning logs clear of progress bars
                model_names_trained += self.tune_model_hyperparameters(
                    model,
                    time_limit=time_left,
                    train_data=train_data,
                    val_data=val_data,
                )
        else:
            time_left = None
            fit_log_message = f"Training timeseries model {model.name}. "
            if time_limit is not None:
                time_start_model = time.time()
                time_left = time_limit - (time_start_model - time_start)
                if time_left <= 0:
                    # Fix: this message was a broken (unterminated) string literal
                    # spanning a raw newline.
                    logger.info(
                        f"Stopping training due to lack of time remaining. "
                        f"Time left: {time_left:.2f} seconds")
                    break
                # NOTE(review): both placeholders format time_left; the second may
                # have been intended as the total remaining budget -- confirm.
                fit_log_message += (
                    f"Training for up to {time_left:.2f}s of "
                    f"the {time_left:.2f}s of remaining time.")
            logger.info(fit_log_message)
            model_names_trained += self._train_and_save(
                train_data, model=model, val_data=val_data, time_limit=time_left)

    if self.enable_ensemble:
        try:
            model_names_trained.append(
                self.fit_ensemble(val_data=val_data, model_names=model_names_trained))
        except Exception:  # noqa -- ensemble failure must not lose the trained models
            logger.error(
                f"\tEnsemble training failed with error \n{traceback.format_exc()}.")

    logger.info(f"Training complete. Models trained: {model_names_trained}")
    logger.info(f"Total runtime: {time.time() - time_start:.2f} s")
    try:
        best_model = self.get_model_best()
        logger.info(f"Best model: {best_model}")
        logger.info(
            f"Best model score: {self.get_model_attribute(best_model, 'val_score'):.4f}")
    except ValueError as e:
        logger.error(str(e))

    return model_names_trained
def _log(s: str, req_url: str, **kwargs) -> None:
    """Echo a message prefixed with its request URL, without disturbing tqdm output.

    :param s: message text
    :param req_url: URL shown in square brackets before the message
    :param kwargs: styling options forwarded to click.secho (fg, bold, ...)
    """
    line = f'[{req_url}] {s}'
    with tqdm.external_write_mode(file=sys.stdout):
        click.secho(line, **kwargs)
def println(self, *args, verbose=0, **kwargs):
    """Print from the main process only, when verbosity is high enough.

    :param args: positional arguments forwarded to print()
    :param verbose: minimum self.args.verbose level required to print
    :param kwargs: keyword arguments forwarded to print()
    """
    should_print = self.is_main_process and self.args.verbose >= verbose
    if not should_print:
        return
    with tqdm.external_write_mode():  # avoid clobbering any active progress bar
        print(*args, **kwargs)