Пример #1
0
 def test_create_partition_forced(self):
     """
     Check create_partition when some files are forced via forced_fns.

     Partitions the sample corpus files into 4 development bins and
     1 validation bin while passing one file name in forced_fns, writes
     the result with write_partition, and then verifies that:

     - the expected number of development and validation parts exist,
     - no file was lost and no file is in both sets,
     - every forced file name is found among the validation files.
     """
     st = create_setting()
     # create_sample_partition cannot be reused here because the globbed
     # file list (pgc_files) is needed for the assertions below
     pgc_files = relglob(st.corpus_dir, "news/pgc/ma/2006-11/*.pgc")
     self.assertTrue(pgc_files)

     forced_fns = ['news/pgc/ma/2006-11/news-2006-11-aligned-part-02.pgc']

     (corpus_fns, corpus_sizes, dev_parts, val_parts,
      dev_sizes, val_sizes) = create_partition(
          pgc_files, corpus_dir=st.corpus_dir,
          dev_bins=4, val_bins=1, forced_fns=forced_fns)

     write_partition(corpus_fns, corpus_sizes, dev_parts, val_parts,
                     dev_sizes, val_sizes)

     self.assertEqual(len(dev_parts), 4)
     # One validation part more than val_bins is expected.
     # NOTE(review): presumably the extra part holds the forced files --
     # confirm against create_partition.
     self.assertEqual(len(val_parts), 1 + 1)

     dev_fns = {part_fname for part_list in dev_parts
                for part_fname in part_list}
     val_fns = {part_fname for part_list in val_parts
                for part_fname in part_list}

     # check that no files were lost
     self.assertEqual(len(dev_fns) + len(val_fns), len(pgc_files))

     # check for overlap; comparing the intersection against the empty
     # set makes a failure report list the offending file names
     self.assertEqual(dev_fns & val_fns, set())

     # check for forced files; assertIn names the missing file on failure
     for forced_fname in forced_fns:
         self.assertIn(forced_fname, val_fns)
Пример #2
0
def expand_globs(corpus_dir, globs):
    """
    Return one flat list with the relglob results for every pattern.

    Each pattern in globs is passed to relglob together with corpus_dir;
    the results are concatenated in the order of the patterns.
    """
    return [
        fname
        for pattern in globs
        for fname in relglob(corpus_dir, pattern)
    ]
Пример #3
0
def expand_globs(corpus_dir, globs):
    """
    Expand every glob pattern with relglob and merge the matches.

    Returns a single list holding the matches of all patterns, kept in
    pattern order.
    """
    # one relglob result per pattern, in the order the patterns were given
    matches_per_pattern = [relglob(corpus_dir, pattern) for pattern in globs]

    # flatten the per-pattern results into a single list
    return [fname for matches in matches_per_pattern for fname in matches]
Пример #4
0
def create_sample_partition(setting):
    """
    Partition the sample corpus and save the result as ./partition.py

    The sample files are collected by passing setting.corpus_dir and the
    pattern "news/pgc/ma/2006-11/*.pgc" to relglob, so a sample corpus is
    expected under the "corpus" subdir. They are partitioned with
    4 development bins and 1 validation bin.
    """
    sample_pattern = "news/pgc/ma/2006-11/*.pgc"
    sample_fnames = relglob(setting.corpus_dir, sample_pattern)

    partition_parts = create_partition(
        sample_fnames,
        corpus_dir=setting.corpus_dir,
        dev_bins=4,
        val_bins=1,
    )

    # create_partition's results are forwarded positionally, unchanged
    write_partition(*partition_parts, out="partition.py")