Example #1
import os
import numpy as np
from time import time
from sacremoses import MosesDetokenizer
# Assumed import paths, mirroring examples #2 and #4 below:
from fewsum.data_pipelines.assemblers import assemble_postproc_pipeline
from fewsum.utils.fields import InpDataF
from mltoolkit.mlutils.helpers.logging_funcs import init_logger
from mltoolkit.mlutils.helpers.paths_and_files import comb_paths, safe_mkdir


def postprocess_data(data_path,
                     out_dir_path,
                     min_revs_per_file=None,
                     workers=1,
                     max_revs_per_file=9,
                     early_term=None,
                     logging_period=1000,
                     seed=42):  # `seed` was undefined in the snippet; exposed as a parameter
    """
    Creates `K` reviews per group files, computes ROUGE 1 vs rest. In this case,
    avoids an expensive online computation of ROUGE.
    """
    logger = init_logger("", output_path=os.path.dirname(out_dir_path))
    dt = MosesDetokenizer()
    detok_func = lambda x: [
        dt.detokenize(_x.split(" "), unescape=False) for _x in x
    ]
    data_pipeline = assemble_postproc_pipeline(
        text_prep_func=detok_func,
        seed=seed,
        min_revs_per_group=min_revs_per_file,
        max_revs_per_group=max_revs_per_file,
        workers=workers)
    logger.info("Writing chunks to: '%s'." % out_dir_path)
    safe_mkdir(out_dir_path)
    chunks_count = 0
    start = time()
    unique_groups = set()
    review_count = 0
    min_rev_per_chunk = float('inf')
    max_rev_per_chunk = float('-inf')
    for dc in data_pipeline.iter(data_path=data_path, early_term=early_term):
        assert len(np.unique(dc[InpDataF.GROUP_ID])) == 1
        group_id = dc[0, InpDataF.GROUP_ID].split("_")[0]
        unique_groups.add(group_id)
        review_count += len(dc)
        min_rev_per_chunk = min(min_rev_per_chunk, len(dc))
        max_rev_per_chunk = max(max_rev_per_chunk, len(dc))
        fp = comb_paths(out_dir_path, "%s.csv" % dc[0][InpDataF.GROUP_ID])
        with open(fp, encoding='utf-8', mode='w') as f:
            dc.to_csv(f)
        chunks_count += 1
        if chunks_count % logging_period == 0:
            logger.info("Wrote %d chunks." % chunks_count)
    logger.info("Totally wrote %d chunks." % chunks_count)
    logger.info("Total time elapsed: %f." % (time() - start))
    logger.info("Unique groups: %d." % len(unique_groups))
    logger.info("Total reviews: %d." % review_count)
    logger.info("Min reviews per chunk: %d." % min_rev_per_chunk)
    logger.info("Max reviews per chunk: %d." % max_rev_per_chunk)
Example #2
from fewsum.data_pipelines.assemblers import assemble_vocab_pipeline
from fewsum.utils.fields import InpDataF
from sacremoses import MosesTruecaser
from mltoolkit.mlutils.helpers.logging_funcs import init_logger
# Assumed import paths for `Vocabulary` and `SPECIAL_TOKENS` used below:
from mltoolkit.mldp.utils.tools import Vocabulary
from fewsum.utils.constants import SPECIAL_TOKENS
import argparse
from functools import partial


def create_word_vocab(vocab_fp, data_path, truecaser_fp):
    """Creates a vocabulary using a vocabulary specific pipeline."""
    tcaser = MosesTruecaser(load_from=truecaser_fp, is_asr=True)
    tcase_func = partial(tcaser.truecase, return_str=True, use_known=True)
    tok_func = lambda x: tcase_func(x).split()
    vocab_pipeline = assemble_vocab_pipeline(text_fname=InpDataF.REV_TEXT,
                                             lowercase=False, tok_func=tok_func)
    word_vocab = Vocabulary(vocab_pipeline, name_prefix="word",
                            special_tokens=SPECIAL_TOKENS)

    word_vocab.create(data_source={'data_path': data_path},
                      data_fnames=InpDataF.REV_TEXT)
    word_vocab.write(vocab_fp, sep=' ')


if __name__ == '__main__':
    logger = init_logger("")
    parser = argparse.ArgumentParser()
    parser.add_argument("--vocab_fp", type=str)
    parser.add_argument('--data_path', type=str, nargs='+')
    parser.add_argument('--truecaser_fp', type=str)
    args = parser.parse_args()
    create_word_vocab(**vars(args))
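
Above, the script is driven via the CLI; a direct-call sketch with hypothetical paths (`data_path` is a list because of `nargs='+'`):

# Hypothetical direct call; paths are placeholders.
create_word_vocab(vocab_fp='artifacts/vocabs/word_vocab.txt',
                  data_path=['data/train.csv'],
                  truecaser_fp='artifacts/true_caser.pkl')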
Example #3
import os
import argparse
import numpy as np
from logging import INFO
from torch import manual_seed
# Assumed import paths for project helpers used below:
from fewsum.config import RUN_CONFIG_REGISTRY
from mltoolkit.mlutils.helpers.logging_funcs import init_logger
from mltoolkit.mlutils.helpers.paths_and_files import comb_paths

parser = argparse.ArgumentParser()
# The snippet began mid-call; the flag name is recovered from the
# `.regime` attribute access below.
parser.add_argument('--regime',
                    type=str,
                    help='Sets the regime of training/inference.',
                    required=True)
parser.add_argument(
    '--inference',
    action='store_true',
    help='If set, will perform inference/summary generation otherwise training.'
)

args = parser.parse_args()
regime = args.regime
inference = args.inference

run_conf = RUN_CONFIG_REGISTRY[regime]()

logger = init_logger(logger_name="",
                     level=INFO,
                     output_path=comb_paths(run_conf.output_path, "log.txt"))

#   ENV and hyper-params handling  #
manual_seed(run_conf.seed)
np.random.seed(run_conf.seed)
cuda_visible_devices = str(run_conf.cuda_device_ids) \
    if isinstance(run_conf.cuda_device_ids, int) else \
    ",".join([str(dev_id) for dev_id in run_conf.cuda_device_ids])
os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices
device_count = 1 if not isinstance(run_conf.cuda_device_ids, list) \
    else max(1, len(run_conf.cuda_device_ids))
# `cuda_device_ids` may be an int or a list; only an empty list selects CPU.
use_cuda = isinstance(run_conf.cuda_device_ids, int) \
    or len(run_conf.cuda_device_ids) > 0
device = 'cuda' if use_cuda else 'cpu'
logger.info('CUDA_VISIBLE_DEVICES=%s' % cuda_visible_devices)

#   DATA SOURCES   #
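
`RUN_CONFIG_REGISTRY[regime]()` implies a simple name-to-config-class mapping; a minimal sketch of that pattern with a hypothetical config class, not the project's actual registry:

# Sketch of the registry pattern; the config class and regime name are hypothetical.
class UnsupervisedRunConfig:
    seed = 42
    cuda_device_ids = [0]
    output_path = 'runs/unsupervised'

RUN_CONFIG_REGISTRY = {'unsupervised': UnsupervisedRunConfig}
run_conf = RUN_CONFIG_REGISTRY['unsupervised']()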
Example #4
from argparse import ArgumentParser
from sacremoses import MosesTruecaser
from mltoolkit.mldp.steps.readers import CsvReader
from mltoolkit.mlutils.helpers.paths_and_files import safe_mkfdir
from mltoolkit.mlutils.helpers.logging_funcs import init_logger
from csv import QUOTE_NONE
import os

logger_name = os.path.basename(__file__)
logger = init_logger(logger_name)


def train_and_save_true_casing_model(input_fps, text_fname, output_fp):
    """Trains the Moses model on tokenized csv files; saves params."""
    mtr = MosesTruecaser(is_asr=True)
    reader = CsvReader(quoting=QUOTE_NONE,
                       sep='\t',
                       engine='python',
                       encoding='utf-8')
    texts = []
    logger.info("Loading data from: '%s'." % input_fps)
    for dc in reader.iter(data_path=input_fps):
        for du in dc.iter():
            texts.append(du[text_fname].split())
    logger.info("Loaded the data.")
    safe_mkfdir(output_fp)
    logger.info("Training the truecaser.")
    mtr.train(texts, save_to=output_fp, progress_bar=True, processes=1)
    logger.info("Done, saved the model to: '%s'." % output_fp)