from fewsum.data_pipelines.assemblers import assemble_postproc_pipeline
from fewsum.utils.fields import InpDataF
from mltoolkit.mlutils.helpers.logging_funcs import init_logger
# Assumed to live next to `safe_mkfdir` used by the truecaser script below.
from mltoolkit.mlutils.helpers.paths_and_files import safe_mkdir, comb_paths
from sacremoses import MosesDetokenizer
from time import time
import numpy as np
import os


def postprocess_data(data_path, out_dir_path, min_revs_per_file=None,
                     workers=1, max_revs_per_file=9, early_term=None,
                     logging_period=1000, seed=None):
    """
    Creates per-group files with up to `max_revs_per_file` reviews each and
    computes ROUGE of each review against the rest. Pre-computing ROUGE here
    avoids an expensive online computation later.
    """
    logger = init_logger("", output_path=os.path.dirname(out_dir_path))
    dt = MosesDetokenizer()
    detok_func = lambda x: [dt.detokenize(_x.split(" "), unescape=False)
                            for _x in x]
    # `seed` was an undefined name in the original fragment; it is exposed as
    # a parameter here so the pipeline's shuffling is reproducible.
    data_pipeline = assemble_postproc_pipeline(
        text_prep_func=detok_func, seed=seed,
        min_revs_per_group=min_revs_per_file,
        max_revs_per_group=max_revs_per_file, workers=workers)
    logger.info("Writing chunks to: '%s'." % out_dir_path)
    safe_mkdir(out_dir_path)
    chunks_count = 0
    start = time()
    unique_groups = set()
    review_count = 0
    min_rev_per_chunk = float('inf')
    max_rev_per_chunk = float('-inf')
    for dc in data_pipeline.iter(data_path=data_path, early_term=early_term):
        # Each chunk must contain reviews of exactly one group.
        assert len(np.unique(dc[InpDataF.GROUP_ID])) == 1
        group_id = dc[0, InpDataF.GROUP_ID].split("_")[0]
        unique_groups.add(group_id)
        review_count += len(dc)
        min_rev_per_chunk = min(min_rev_per_chunk, len(dc))
        max_rev_per_chunk = max(max_rev_per_chunk, len(dc))
        fp = comb_paths(out_dir_path, "%s.csv" % dc[0][InpDataF.GROUP_ID])
        dc.to_csv(open(fp, encoding='utf-8', mode='w'))
        chunks_count += 1
        if chunks_count % logging_period == 0:
            logger.info("Wrote %d chunks." % chunks_count)
    logger.info("Totally wrote %d chunks." % chunks_count)
    logger.info("Total time elapsed: %f." % (time() - start))
    logger.info("Unique groups: %d." % len(unique_groups))
    logger.info("Total reviews: %d." % review_count)
    logger.info("Min reviews per chunk: %d." % min_rev_per_chunk)
    logger.info("Max reviews per chunk: %d." % max_rev_per_chunk)
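# A minimal usage sketch of the post-processing step above; all paths and the
# seed value are hypothetical, not part of the original script.
if __name__ == '__main__':
    postprocess_data(data_path='data/tokenized/train',
                     out_dir_path='data/postprocessed/train',
                     min_revs_per_file=9, workers=4, seed=42)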
from fewsum.data_pipelines.assemblers import assemble_vocab_pipeline
from fewsum.utils.fields import InpDataF
from sacremoses import MosesTruecaser
from mltoolkit.mlutils.helpers.logging_funcs import init_logger
# The original fragment used `Vocabulary` and `SPECIAL_TOKENS` without
# importing them; the locations below are assumed.
from mltoolkit.mldp.utils.tools import Vocabulary
from fewsum.utils.constants import SPECIAL_TOKENS
import argparse
from functools import partial


def create_word_vocab(vocab_fp, data_path, truecaser_fp):
    """Creates a word vocabulary using a vocabulary-specific pipeline."""
    tcaser = MosesTruecaser(load_from=truecaser_fp, is_asr=True)
    tcase_func = partial(tcaser.truecase, return_str=True, use_known=True)
    tok_func = lambda x: tcase_func(x).split()
    vocab_pipeline = assemble_vocab_pipeline(text_fname=InpDataF.REV_TEXT,
                                             lowercase=False,
                                             tok_func=tok_func)
    word_vocab = Vocabulary(vocab_pipeline, name_prefix="word",
                            special_tokens=SPECIAL_TOKENS)
    word_vocab.create(data_source={'data_path': data_path},
                      data_fnames=InpDataF.REV_TEXT)
    word_vocab.write(vocab_fp, sep=' ')


if __name__ == '__main__':
    logger = init_logger("")
    parser = argparse.ArgumentParser()
    parser.add_argument('--vocab_fp', type=str)
    parser.add_argument('--data_path', type=str, nargs='+')
    parser.add_argument('--truecaser_fp', type=str)
    args = parser.parse_args()
    create_word_vocab(**vars(args))
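# Example invocation of the vocabulary script (all paths are hypothetical):
#   python create_word_vocab.py \
#       --vocab_fp artifacts/vocabs/word_vocab.txt \
#       --data_path data/tokenized/train \
#       --truecaser_fp artifacts/truecaser.pkl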
parser = ArgumentParser()
parser.add_argument('--regime', type=str, required=True,
                    help='Sets the regime of training/inference.')
parser.add_argument(
    '--inference', action='store_true',
    help='If set, will perform inference/summary generation, otherwise '
         'training.')
# Parse once; the original called `parse_args()` twice.
args = parser.parse_args()
regime = args.regime
inference = args.inference

run_conf = RUN_CONFIG_REGISTRY[regime]()
logger = init_logger(logger_name="", level=INFO,
                     output_path=comb_paths(run_conf.output_path, "log.txt"))

# ENV and hyper-params handling #

manual_seed(run_conf.seed)
np.random.seed(run_conf.seed)
cuda_visible_devices = str(run_conf.cuda_device_ids) \
    if isinstance(run_conf.cuda_device_ids, int) \
    else ",".join([str(dev_id) for dev_id in run_conf.cuda_device_ids])
os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices
device_count = 1 if not isinstance(run_conf.cuda_device_ids, list) \
    else max(1, len(run_conf.cuda_device_ids))
# An int device id always selects a GPU; the original applied `len()` to the
# ids unconditionally, which would raise a TypeError on an int id.
device = 'cuda' if isinstance(run_conf.cuda_device_ids, int) \
    or len(run_conf.cuda_device_ids) > 0 else 'cpu'
logger.info('CUDA_VISIBLE_DEVICES=%s' % cuda_visible_devices)

# DATA SOURCES #
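# Example invocation of the runner script above (the script name and regime
# value are hypothetical; valid regimes are the keys of RUN_CONFIG_REGISTRY):
#   python workflow.py --regime=unsupervised
#   python workflow.py --regime=unsupervised --inference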
from argparse import ArgumentParser
from sacremoses import MosesTruecaser
from mltoolkit.mldp.steps.readers import CsvReader
from mltoolkit.mlutils.helpers.paths_and_files import safe_mkfdir
from mltoolkit.mlutils.helpers.logging_funcs import init_logger
from csv import QUOTE_NONE
import os

logger_name = os.path.basename(__file__)
logger = init_logger(logger_name)


def train_and_save_true_casing_model(input_fps, text_fname, output_fp):
    """Trains the Moses model on tokenized csv files; saves params."""
    mtr = MosesTruecaser(is_asr=True)
    reader = CsvReader(quoting=QUOTE_NONE, sep='\t', engine='python',
                       encoding='utf-8')
    texts = []
    logger.info("Loading data from: '%s'." % input_fps)
    for dc in reader.iter(data_path=input_fps):
        for du in dc.iter():
            texts.append(du[text_fname].split())
    logger.info("Loaded the data.")
    safe_mkfdir(output_fp)
    logger.info("Training the truecaser.")
    mtr.train(texts, save_to=output_fp, progress_bar=True, processes=1)
    logger.info("Done, saved the model to: '%s'." % output_fp)
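# A minimal CLI entry sketch; the original fragment ends before its `__main__`
# block, so these flag names (mirroring the function's parameters) are
# assumptions.
if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--input_fps', type=str, nargs='+')
    parser.add_argument('--text_fname', type=str)
    parser.add_argument('--output_fp', type=str)
    args = parser.parse_args()
    train_and_save_true_casing_model(**vars(args))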