def plot_speech_features(feature_matrix, file_name=None, vertical=True, ax=None, length=8.0, dpi=100):
    """Plot a speech feature matrix as a heat map.

    Args:
      feature_matrix: a two-dimensional numpy array of feature values; the color scale is fixed to the
                      range [-1, 1]
      file_name: the name of the file to which we write the plot; if not given, nothing is saved and the
                 plt context will be left un-closed
      vertical: if True, the time dimension will be projected onto the y axis, otherwise the x axis
      ax: if given, draw on this matplotlib axis; otherwise create a new figure
      length: figure length (if ax is not given)
      dpi: plot resolution
    """
    # Test against None explicitly instead of relying on the truthiness of an axes object.
    own_figure = ax is None
    if own_figure:
        plt.subplots(figsize=(1.0, length))
    if vertical:
        # Reverse the second dimension and transpose to obtain the vertical layout.
        feature_matrix = feature_matrix[:, ::-1].T
    if own_figure:
        # No axis supplied: draw through pyplot onto the figure created above.
        plt.pcolor(feature_matrix, cmap=plt.cm.jet, vmin=-1, vmax=1)
        plt.axis('off')
    else:
        ax.pcolor(feature_matrix, cmap=plt.cm.jet, vmin=-1, vmax=1)
        ax.axis('off')
    if file_name is not None:
        utils.make_parent_dir(file_name)
        plt.savefig(file_name, dpi=dpi)
        plt.close()
def set_out_file(out_file, exp_name):
    """
    Set the file to log to. Before calling this, logs are only passed to stdout/stderr.

    Calls ``unset_out_file()`` first, then (re)creates ``out_file`` with the preamble lines, attaches a
    file handler for it to ``logger`` and ``logger_file``, attaches a handler for ``{out_file}.yaml`` to
    ``yaml_logger``, and points the tensorboard writer at ``{out_file}.tb``.

    Args:
      out_file: file name
      exp_name: name of experiment
    """
    unset_out_file()
    utils.make_parent_dir(out_file)
    # Write the preamble as UTF-8 so that it matches the encoding used by the
    # FileHandler below, which appends to the very same file.
    with open(out_file, mode="w", encoding="utf-8") as f_out:
        for line in _preamble_content:
            f_out.write(f"{line}\n")

    fh = logging.FileHandler(out_file, encoding="utf-8")
    fh.setLevel(settings.LOG_LEVEL_FILE)
    fh.setFormatter(MainFormatter())
    logger.addHandler(fh)
    logger_file.addHandler(fh)

    yaml_fh = logging.FileHandler(f"{out_file}.yaml", mode='w', encoding="utf-8")
    yaml_fh.setLevel(logging.DEBUG)
    yaml_fh.setFormatter(YamlFormatter())
    yaml_logger.addHandler(yaml_fh)

    tensorboard_writer.set_out_file(f"{out_file}.tb", exp_name=exp_name)
def conclude_report(self) -> None:
    """Run compare-mt on the collected sentences and write its output to ``compare-mt.txt``.

    Does nothing when no hypothesis sentences have been collected. Otherwise the references and
    hypotheses are dumped to temporary files below ``<report_path>/tmp``, compare-mt is invoked on
    them, its output lines are written to the report file, and the collected sentences are cleared.
    """
    if not self.hyp_sents:
        return

    tmp_ref_path = os.path.join(self.report_path, "tmp", "compare-mt.ref")
    tmp_hyp_path = os.path.join(self.report_path, "tmp", "compare-mt.out")
    # Both temporary files share one directory, so creating it once is enough.
    utils.make_parent_dir(tmp_hyp_path)
    with open(tmp_ref_path, "w") as ref_stream:
        ref_stream.writelines(f"{ref.strip()}\n" for ref in self.ref_sents)
    with open(tmp_hyp_path, "w") as hyp_stream:
        hyp_stream.writelines(f"{hyp.strip()}\n" for hyp in self.hyp_sents)

    import xnmt.thirdparty.comparemt.compare_mt as compare_mt
    compare_args = utils.ArgClass(ref_file=tmp_ref_path,
                                  out_file=tmp_hyp_path,
                                  out2_file=self.out2_file,
                                  train_file=self.train_file,
                                  train_counts=self.train_counts,
                                  alpha=self.alpha,
                                  ngram=self.ngram,
                                  ngram_size=self.ngram_size,
                                  sent_size=self.sent_size)
    report_lines = compare_mt.main(compare_args)

    report_file = os.path.join(self.report_path, "compare-mt.txt")
    utils.make_parent_dir(report_file)
    with open(report_file, "w") as report_stream:
        report_stream.writelines(f"{entry}\n" for entry in report_lines)

    self.hyp_sents = []
    self.ref_sents = []
    self.src_sents = []
def perform_inference(self, generator: 'models.GeneratorModel', src_file: str = None,
                      trg_file: str = None) -> None:
    """
    Perform inference.

    In ``'score'`` mode the reference corpus is scored and the rescored output is written; in every
    other mode output is generated, and in ``'forceddebug'`` mode the reference scores are computed
    first and handed to the generation step.

    Args:
      generator: the model to be used
      src_file: path of input src file to be translated; falls back to ``self.src_file``
      trg_file: path of file where trg translations will be written; falls back to ``self.trg_file``
    """
    if not src_file:
        src_file = self.src_file
    if not trg_file:
        trg_file = self.trg_file
    utils.make_parent_dir(trg_file)

    logger.info(f'Performing inference on {src_file}')

    ref_corpus, src_corpus = self._read_corpus(generator, src_file, mode=self.mode, ref_file=self.ref_file)

    event_trigger.set_train(False)

    if self.mode == 'score':
        # Scoring only: write the rescored references and skip generation.
        scores = self._compute_losses(generator, ref_corpus, src_corpus, self.max_num_sents)
        self._write_rescored_output(scores, self.ref_file, trg_file)
        return

    expected_scores = None
    if self.mode == 'forceddebug':
        expected_scores = self._compute_losses(generator, ref_corpus, src_corpus, self.max_num_sents)

    self._generate_output(generator=generator,
                          forced_ref_corpus=ref_corpus,
                          assert_scores=expected_scores,
                          src_corpus=src_corpus,
                          trg_file=trg_file,
                          batcher=self.batcher,
                          max_src_len=self.max_src_len)
def __init__(self, report_path: str, src_vocab=Ref(Path("model.src_reader.vocab"))):
    """Set up the reporter and route the ``segmenting_reporter`` logger to ``report_path``.

    Args:
      report_path: file the logger writes to; its parent directory is created if needed
      src_vocab: source vocabulary (by default a reference to ``model.src_reader.vocab``)
    """
    self.src_vocab = src_vocab
    report_logger = logging.getLogger("segmenting_reporter")
    self.logger = report_logger
    utils.make_parent_dir(report_path)
    # The stream is handed to the handler and stays open for as long as the handler lives.
    report_stream = open(report_path, "w")
    report_logger.addHandler(logging.StreamHandler(report_stream))
    report_logger.setLevel("INFO")
def write_html(self) -> None:
    """Pretty-print the collected HTML fragments and write them to ``<report_path>/<report_name>.html``."""
    raw_markup = "\n".join(self.html_contents)
    formatted_markup = bs(raw_markup, "lxml").prettify()
    target_path = os.path.join(self.report_path, f"{self.report_name}.html")
    utils.make_parent_dir(target_path)
    with open(target_path, 'w', encoding='utf-8') as html_out:
        html_out.write(formatted_markup)
def plot_attention(src_words, trg_words, attention_matrix, file_name, size_x=8.0, size_y=8.0):
    """Draw an attention matrix as a heat map and save the figure to a file.

    Args:
      src_words: a list of words in the source; if a numpy array is passed instead, it is treated as
                 speech features and drawn in an extra panel via ``plot_speech_features`` rather than
                 being used for y-axis labels
      trg_words: a list of target words
      attention_matrix: a two-dimensional numpy array of values between zero and one,
                        where rows correspond to source words, and columns correspond to target words
      file_name: the name of the file to which we write the attention
      size_x: width of the main plot
      size_y: height of the plot
    """
    trg_words = [unidecode(w) for w in trg_words]
    # A numpy array as source means there are no source words to print as labels.
    src_is_speech = isinstance(src_words, np.ndarray)
    # The total number of label characters decides font size and resolution below.
    max_len = len(''.join(trg_words))
    if not src_is_speech:
        max_len = max(max_len, len(''.join(src_words)))
        src_words = [unidecode(w) for w in src_words]
    # Shrink the font for long sentences. Note that this changes matplotlib's rc settings and they
    # are not restored inside this function.
    if max_len > 150:
        matplotlib.rc('font', size=5)
    elif max_len > 50:
        matplotlib.rc('font', size=7)
    dpi = 100 if max_len <= 150 else 150
    # For speech input a second, narrow panel (width ratio 1 : size_x) is reserved for the features.
    fig, axs = plt.subplots(
        nrows=1,
        ncols=2 if src_is_speech else 1,
        figsize=(size_x + (1.0 if src_is_speech else 0.0), size_y),
        gridspec_kw={'width_ratios': [1, size_x]} if src_is_speech else None)
    ax = axs[1] if src_is_speech else axs
    # put the major ticks at the middle of each cell
    ax.set_xticks(np.arange(attention_matrix.shape[1]) + 0.5, minor=False)
    ax.set_yticks(np.arange(attention_matrix.shape[0]) + 0.5, minor=False)
    ax.invert_yaxis()
    if src_is_speech:
        # NOTE(review): plt.* calls act on pyplot's current axes, presumably the attention panel
        # (axs[1]) at this point -- confirm.
        plt.yticks([], [])
    # label axes by words
    ax.set_xticklabels(trg_words, minor=False)
    if not src_is_speech:
        ax.set_yticklabels(src_words, minor=False)
    ax.xaxis.tick_top()
    # draw the heatmap
    plt.pcolor(attention_matrix, cmap=plt.cm.Blues, vmin=0, vmax=1)
    plt.colorbar()
    if src_is_speech:
        # Draw the speech features into the left-hand panel.
        ax = axs[0]
        plot_speech_features(feature_matrix=src_words, ax=ax, dpi=dpi)
        fig.tight_layout()
    utils.make_parent_dir(file_name)
    plt.savefig(file_name, dpi=dpi)
    plt.close()
def setUp(self):
    """Reset global state, register the dummy YAML representers and prepare the ``test/tmp`` output dir."""
    events.clear()
    xnmt.resolved_serialize_params = {}
    for dummy_cls in (DummyArgClass, DummyArgClass2):
        yaml.add_representer(dummy_cls, xnmt.init_representer)

    tmp_dir = os.path.join("test", "tmp")
    self.out_dir = tmp_dir
    # make_parent_dir works on file paths, so a placeholder file name is used to create the directory.
    utils.make_parent_dir(os.path.join(tmp_dir, "asdf"))
    self.model_file = os.path.join(tmp_dir, "saved.mod")

    manager = param_collections.ParamManager
    manager.init_param_col()
    manager.param_col.model_file = self.model_file
def conclude_report(self) -> None:
    """Run charcut on the collected sentences and write ``charcut.html`` below ``self.report_path``.

    Does nothing when no hypothesis sentences have been collected; otherwise the collected
    sentences are cleared once the report has been produced.
    """
    if not self.hyp_sents:
        return

    report_file = os.path.join(self.report_path, "charcut.html")
    utils.make_parent_dir(report_file)
    charcut_args = utils.ArgClass(html_output_file=report_file,
                                  match_size=self.match_size,
                                  alt_norm=self.alt_norm)
    segments = charcut.load_input_segs(cand_segs=self.hyp_sents,
                                       ref_segs=self.ref_sents,
                                       src_segs=self.src_sents)
    charcut.run_on(segments, charcut_args)

    self.hyp_sents = []
    self.ref_sents = []
    self.src_sents = []
def create_sent_report(self, segment_actions, src, **kwargs):
    """Append one line with the segmented form of ``src`` to the segment report.

    The report file ``segment.txt`` below ``self.report_path`` is opened on the first call and kept
    open in ``self.report_fp`` for subsequent calls.

    Args:
      segment_actions: sequence whose first element lists the (inclusive) end index of each segment
      src: source sentence; must provide ``str_tokens()``
      **kwargs: ignored
    """
    if self.report_fp is None:
        report_path = os.path.join(self.report_path, "segment.txt")
        utils.make_parent_dir(report_path)
        self.report_fp = open(report_path, "w")

    actions = segment_actions[0]
    tokens = src.str_tokens()
    words = []
    start = 0
    for end in actions:
        # Skip empty segments so that they do not show up as blank words in the report.
        if start < end + 1:
            # Join the tokens themselves: calling str() on the slice would emit the
            # list's repr (brackets, quotes and commas) instead of the segment text.
            words.append("".join(map(str, tokens[start:end + 1])))
        start = end + 1
    print(" ".join(words), file=self.report_fp)
def create_sent_report(self, segment_actions, src: sent.Sentence, **kwargs):
    """Write the segmentation of ``src`` as one space-separated line to the report file.

    The file at ``self.report_path`` is opened on the first call and kept in ``self.report_fp``.

    Args:
      segment_actions: sequence whose first element lists the (inclusive) end index of each segment
      src: the source sentence whose string tokens are grouped into segments
      **kwargs: ignored
    """
    if self.report_fp is None:
        utils.make_parent_dir(self.report_path)
        self.report_fp = open(self.report_path, "w")

    boundaries = segment_actions[0]
    tokens = src.str_tokens()
    segments = []
    begin = 0
    for boundary in boundaries:
        stop = boundary + 1
        # Empty spans are skipped, but the cursor still moves to the new boundary.
        if begin < stop:
            segments.append("".join(str(token) for token in tokens[begin:stop]))
        begin = stop
    print(" ".join(segments), file=self.report_fp)
def run_preproc_task(self, overwrite: bool = False) -> None:
    """Tokenize every input file into its corresponding output file.

    Tokenizers are looked up per file index, falling back to the ``"all"`` entry of the specs.

    Args:
      overwrite: if False, output files that already exist are left untouched
    """
    tokenizers_by_file = {spec["filenum"]: list(spec["tokenizers"]) for spec in self.specs}
    for file_num, (in_file, out_file) in enumerate(zip(self.in_files, self.out_files)):
        if not overwrite and os.path.isfile(out_file):
            continue
        utils.make_parent_dir(out_file)
        active_tokenizers = tokenizers_by_file.get(file_num, tokenizers_by_file["all"])
        with open(out_file, "w", encoding='utf-8') as out_stream, \
                open(in_file, "r", encoding='utf-8') as in_stream:
            # Chain the tokenizers: each one wraps the stream produced by the previous one.
            token_stream = in_stream
            for tokenizer in active_tokenizers:
                token_stream = tokenizer.tokenize_stream(token_stream)
            for tokenized in token_stream:
                out_stream.write(f"{tokenized}\n")
def __init__(self, report_path: str = None, src_vocab=Ref(Path("model.src_reader.vocab")),
             trg_vocab=Ref(Path("model.trg_reader.vocab"))):
    """Set up the reporter and attach a stream handler to the ``simult`` logger.

    Args:
      report_path: file the logger writes to; if None, the logger writes to stderr instead
      src_vocab: source vocabulary (by default a reference to ``model.src_reader.vocab``)
      trg_vocab: target vocabulary (by default a reference to ``model.trg_reader.vocab``)
    """
    self.src_vocab = src_vocab
    self.trg_vocab = trg_vocab
    self.logger = logging.getLogger("simult")
    if report_path is None:
        sink = sys.stderr
    else:
        utils.make_parent_dir(report_path)
        sink = open(report_path, "w")
    handler = logging.StreamHandler(sink)
    self.logger.addHandler(handler)
    self.logger.setLevel("INFO")
def run_preproc_task(self, overwrite: bool = False) -> None:
    """Normalize every input file line by line into its corresponding output file.

    Normalizers are looked up per file index, falling back to the ``"all"`` entry of the specs.

    Args:
      overwrite: if False, output files that already exist are left untouched
    """
    normalizers_by_file = {spec["filenum"]: list(spec["normalizers"]) for spec in self.specs}
    for file_idx, (in_file, out_file) in enumerate(zip(self.in_files, self.out_files)):
        if not overwrite and os.path.isfile(out_file):
            continue
        utils.make_parent_dir(out_file)
        active_normalizers = normalizers_by_file.get(file_idx, normalizers_by_file["all"])
        with open(out_file, "w", encoding='utf-8') as out_stream, \
                open(in_file, "r", encoding='utf-8') as in_stream:
            for raw_line in in_stream:
                text = raw_line.strip()
                # Apply the normalizers in order, each on the result of the previous one.
                for normalizer in active_normalizers:
                    text = normalizer.normalize(text)
                out_stream.write(text + "\n")
def run_preproc_task(self, overwrite: bool = False) -> None:
    """Count the words of every input file, filter the counts and write the remaining words.

    Filters are looked up per file index, falling back to the ``"all"`` entry of the specs. The
    output file receives one word per line.

    Args:
      overwrite: if False, output files that already exist are left untouched
    """
    filters_by_file = {spec["filenum"]: list(spec["filters"]) for spec in self.specs}
    for file_idx, (in_file, out_file) in enumerate(zip(self.in_files, self.out_files)):
        if not overwrite and os.path.isfile(out_file):
            continue
        utils.make_parent_dir(out_file)
        with open(out_file, "w", encoding='utf-8') as out_stream, \
                open(in_file, "r", encoding='utf-8') as in_stream:
            # Word -> number of occurrences in the input file.
            counts = {}
            for raw_line in in_stream:
                for token in raw_line.strip().split():
                    if token in counts:
                        counts[token] += 1
                    else:
                        counts[token] = 1
            # Each filter receives the vocabulary returned by the previous one.
            for vocab_filter in filters_by_file.get(file_idx, filters_by_file["all"]):
                counts = vocab_filter.filter(counts)
            for token in counts.keys():
                out_stream.write(token + "\n")
def __init__(self,
             train_files: Sequence[str],
             vocab_size: numbers.Integral,
             overwrite: bool = False,
             model_prefix: str = 'sentpiece',
             output_format: str = 'piece',
             model_type: str = 'bpe',
             hard_vocab_limit: bool = True,
             encode_extra_options: Optional[str] = None,
             decode_extra_options: Optional[str] = None) -> None:
    """
    Store the configuration of a sentencepiece tokenizer and assemble its training arguments.

    The constructor itself only records the settings, creates the parent directory of
    ``model_prefix`` and leaves ``self.sentpiece_processor`` unset (None).

    NOTE(review): the previous documentation stated that a learned model is not overwritten when
    ``overwrite`` is False, even if parameters are changed, and that sentencepiece "file" output is
    buffered in a StringIO before being written to disk; that logic is not part of this
    constructor -- confirm against the code that uses these attributes.

    Args:
      train_files: files to train on; joined with commas into the ``--input`` argument
      vocab_size: value of the ``--vocab_size`` argument
      overwrite: stored as ``self.overwrite``
      model_prefix: value of the ``--model_prefix`` argument
      output_format: stored as both ``self.output_format`` and ``self.input_format``
      model_type: value of the ``--model_type`` argument
      hard_vocab_limit: value of the ``--hard_vocab_limit`` argument (lower-cased)
      encode_extra_options: if given, wrapped into an ``--extra_options=`` argument for encoding
      decode_extra_options: if given, wrapped into an ``--extra_options=`` argument for decoding
    """
    self.model_prefix = model_prefix
    self.output_format = output_format
    self.input_format = output_format
    self.overwrite = overwrite

    self.encode_extra_options = []
    if encode_extra_options:
        self.encode_extra_options.append('--extra_options=' + encode_extra_options)
    self.decode_extra_options = []
    if decode_extra_options:
        self.decode_extra_options.append('--extra_options=' + decode_extra_options)

    utils.make_parent_dir(model_prefix)
    joined_inputs = ','.join(train_files)
    vocab_limit_flag = str(hard_vocab_limit).lower()
    self.sentpiece_train_args = [
        f'--input={joined_inputs}',
        f'--model_prefix={model_prefix!s}',
        f'--vocab_size={vocab_size!s}',
        f'--hard_vocab_limit={vocab_limit_flag}',
        f'--model_type={model_type!s}',
    ]

    self.sentpiece_processor = None
def setUp(self):
    """Register the dummy YAML representer and make sure the ``test/tmp`` output directory exists."""
    yaml.add_representer(DummyClass, xnmt.init_representer)
    self.out_dir = "test/tmp"
    # make_parent_dir works on file paths, so a placeholder file name is used to create the directory.
    placeholder_file = f"{self.out_dir}/asdf"
    utils.make_parent_dir(placeholder_file)
def run_preproc_task(self, overwrite: bool = False) -> None:
    """Run the extractor given by ``self.specs`` on every input/output file pair.

    Args:
      overwrite: if False, output files that already exist are left untouched
    """
    extractor = self.specs
    for source_path, target_path in zip(self.in_files, self.out_files):
        if not overwrite and os.path.isfile(target_path):
            continue
        utils.make_parent_dir(target_path)
        extractor.extract_to(source_path, target_path)