def test_create_partition_forced(self):
    """
    Partition the sample corpus with a forced file name and check the
    number of parts, that no file is lost or duplicated across the
    development and validation parts, and that every forced file ends
    up among the validation files.
    """
    setting = create_setting()
    # create_sample_partition is not reused here, because the list of
    # pgc files itself is needed for the assertions below
    pgc_files = relglob(setting.corpus_dir, "news/pgc/ma/2006-11/*.pgc")
    self.assertTrue(pgc_files)

    forced_fns = ['news/pgc/ma/2006-11/news-2006-11-aligned-part-02.pgc']

    (fnames, sizes,
     dev_parts, val_parts,
     dev_sizes, val_sizes) = create_partition(pgc_files,
                                              corpus_dir=setting.corpus_dir,
                                              dev_bins=4,
                                              val_bins=1,
                                              forced_fns=forced_fns)
    write_partition(fnames, sizes, dev_parts, val_parts,
                    dev_sizes, val_sizes)

    self.assertEqual(len(dev_parts), 4)
    # one more validation part than the val_bins requested
    # (presumably the extra part holds the forced files -- confirm
    # against create_partition)
    self.assertEqual(len(val_parts), 1 + 1)

    # flatten the lists of parts into sets of file names
    dev_fns = set()
    for part in dev_parts:
        dev_fns.update(part)

    val_fns = set()
    for part in val_parts:
        val_fns.update(part)

    # no file may get lost
    self.assertEqual(len(dev_fns) + len(val_fns), len(pgc_files))

    # development and validation files may not overlap
    self.assertTrue(dev_fns.isdisjoint(val_fns))

    # every forced file must be among the validation files
    for fname in forced_fns:
        self.assertTrue(fname in val_fns)
def expand_globs(corpus_dir, globs):
    """
    Expand each glob pattern in globs with relglob against corpus_dir
    and return all matches as a single flat list, in pattern order.
    """
    return [fname
            for glob_pattern in globs
            for fname in relglob(corpus_dir, glob_pattern)]
def create_sample_partition(setting):
    """
    Build a partition of the sample news files (4 development bins,
    1 validation bin) and write it to ./partition.py

    The sample corpus is expected under the "corpus" subdir.
    """
    sample_pattern = "news/pgc/ma/2006-11/*.pgc"
    sample_fnames = relglob(setting.corpus_dir, sample_pattern)

    sample_partition = create_partition(sample_fnames,
                                        corpus_dir=setting.corpus_dir,
                                        dev_bins=4,
                                        val_bins=1)
    # create_partition's results are passed on positionally, as is
    write_partition(*sample_partition, out="partition.py")