def test_to_repr_2_nosep(self):
    """Numbers-only splitting (SPLIT=2): only number tokens are decomposed."""
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 1,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 2,
        PrepParam.TABS_NEWLINES: 1,
        PrepParam.MARK_LOGS: 1,
        PrepParam.CAPS: 1,
    })
    split_config = NgramSplitConfig(splitting_type=NgramSplittingType.ONLY_NUMBERS)

    actual = to_repr(prep_config, tokens, split_config)

    expected = [
        pl['word_start'], '1', '.', '1', pl['word_end'],
        '*',
        pl['non_eng'],
        '"', pl['word_start'], pl['capitals'], 'a', pl['capital'],
        pl['non_eng'], pl['word_end'], '"',
        '/*', pl['non_eng'],
        pl['word_start'], pl['non_eng'], '_', 'english', pl['word_end'],
        '*/',
        '//', pl['word_start'], pl['capitals'], pl['non_eng'], '8',
        pl['word_end'], pl['olc_end'],
    ]
    self.assertEqual(expected, actual)
def test_merges_no_cache(self):
    """With an empty merges cache, BPE applies the merge rules directly."""
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 0,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 4,
        PrepParam.TABS_NEWLINES: 0,
        PrepParam.MARK_LOGS: 1,
        PrepParam.CAPS: 1,
    })
    # Single merge rule: 'w' + 'h' -> 'wh' (priority 0); cache deliberately empty.
    split_config = NgramSplitConfig(
        splitting_type=NgramSplittingType.BPE,
        merges={('w', 'h'): 0},
        merges_cache={},
    )
    input_tokens = [SplitContainer.from_single_token("While")]

    actual = to_repr(prep_config, input_tokens, split_config)

    expected = [
        pl['word_start'], pl['capital'],
        'wh', 'i', 'l', 'e',
        pl['word_end'],
    ]
    self.assertEqual(expected, actual)
def test_to_repr_no_no_sep_with_bpe_no_merges(self):
    """BPE with no merge rules degrades to character-level splitting."""
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 1,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 4,
        PrepParam.TABS_NEWLINES: 1,
        PrepParam.MARK_LOGS: 1,
        PrepParam.CAPS: 1,
    })
    split_config = NgramSplitConfig(
        splitting_type=NgramSplittingType.BPE,
        merges=[],
        merges_cache={},
    )

    actual = to_repr(prep_config, tokens, split_config)

    expected = [
        pl['word_start'], '1', '.', '1', pl['word_end'],
        '*',
        pl['non_eng'],
        '"', pl['word_start'], pl['capitals'], 'a', pl['capital'],
        pl['non_eng'], pl['word_end'], '"',
        '/*', pl['non_eng'],
        # 'english' falls apart into single characters: no merges available.
        pl['word_start'], pl['non_eng'], '_',
        'e', 'n', 'g', 'l', 'i', 's', 'h',
        pl['word_end'],
        '*/',
        '//', pl['word_start'], pl['capitals'], pl['non_eng'], '8',
        pl['word_end'], pl['olc_end'],
    ]
    self.assertEqual(expected, actual)
def test_log_no_mark_logs(self):
    """With MARK_LOGS=0 a log statement is rendered as plain tokens."""
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 1,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 1,
        PrepParam.TABS_NEWLINES: 0,
        PrepParam.MARK_LOGS: 0,
        PrepParam.CAPS: 1,
    })
    split_config = NgramSplitConfig()
    log_tokens = [
        LogStatement(
            SplitContainer.from_single_token('LOGGER'),
            SplitContainer.from_single_token('Info'),
            INFO,
            [StringLiteral([SplitContainer.from_single_token("Hi")])],
        )
    ]

    actual = to_repr(prep_config, log_tokens, split_config)

    expected = [
        pl['capitals'], 'logger', '.',
        pl['capital'], 'info',
        '(', '"', pl['capital'], 'hi', '"', ')', ';',
    ]
    self.assertEqual(expected, actual)
def test_to_repr_with_non_eng(self):
    """With EN_ONLY=0 non-English words are kept verbatim, not masked."""
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 0,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 3,
        PrepParam.TABS_NEWLINES: 1,
        PrepParam.MARK_LOGS: 1,
        PrepParam.CAPS: 1,
    })
    split_config = NgramSplitConfig(
        splitting_type=NgramSplittingType.NUMBERS_AND_CUSTOM,
        sc_splittings={
            'english': ['engl', 'ish'],
            'dieselbe': ['die', 'selbe'],
        },
    )

    actual = to_repr(prep_config, tokens, split_config)

    expected = [
        pl['word_start'], '1', '.', '1', pl['word_end'],
        '*',
        'dinero',
        '"', pl['word_start'], pl['capitals'], 'a', pl['capital'],
        'wirklich', pl['word_end'], '"',
        '/*', 'ц',
        pl['word_start'], 'blanco', '_', 'engl', 'ish', pl['word_end'],
        '*/',
        '//', pl['word_start'], pl['capitals'], 'die', 'selbe', '8',
        pl['word_end'], pl['olc_end'],
    ]
    self.assertEqual(expected, actual)
def test_to_repr_with_enonlycontents(self):
    """EN_ONLY=2 collapses a fully non-English string literal to one placeholder."""
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 2,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 3,
        PrepParam.TABS_NEWLINES: 1,
        PrepParam.MARK_LOGS: 1,
        PrepParam.CAPS: 1,
    })
    split_config = NgramSplitConfig(
        splitting_type=NgramSplittingType.NUMBERS_AND_CUSTOM,
        sc_splittings={},
    )
    # A string literal made entirely of non-English words.
    non_eng_words = [
        "ich", "weiss", "nicht", "was", "soll", "es",
        "bedeuten", "dass", "ich", "so", "traurig", "bin",
    ]
    input_tokens = [
        Number([1, DecimalPoint(), 1]),
        "*",
        SplitContainer([NonEng(Word.from_("dinero"))]),
        StringLiteral([NonEng(Word.from_(w)) for w in non_eng_words]),
        NewLine(),
        MultilineComment([
            SplitContainer([NonEng(Word.from_('ц'))]),
            SplitContainer([
                NonEng(Word.from_("blanco")),
                Underscore(),
                Word.from_("english"),
            ]),
        ]),
        NewLine(),
        Tab(),
        OneLineComment([
            SplitContainer([NonEng(Word.from_("DIESELBE")), Word.from_("8")])
        ]),
    ]

    actual = to_repr(prep_config, input_tokens, split_config)

    expected = [
        pl['word_start'], '1', '.', '1', pl['word_end'],
        '*',
        pl['non_eng'],
        '"', pl['non_eng_content'], '"',
        '/*', pl['non_eng'],
        pl['word_start'], pl['non_eng'], '_', 'english', pl['word_end'],
        '*/',
        '//', pl['word_start'], pl['capitals'], pl['non_eng'], '8',
        pl['word_end'], pl['olc_end'],
    ]
    self.assertEqual(expected, actual)
def calc_stats_for_prepconfig(prepconfig, lang_checker, token_list, include_sample=False):
    """Preprocess *token_list* per the encoded prep config and compute language stats.

    :param prepconfig: encoded prep-config string, decoded via PrepConfig.from_encoded_string.
    :param lang_checker: object providing calc_lang_stats(tokens, include_sample=...).
    :param token_list: parsed tokens to be converted to their representation.
    :param include_sample: forwarded to lang_checker.calc_lang_stats.
    :return: whatever lang_checker.calc_lang_stats returns.
    """
    # Renamed from 'repr', which shadowed the builtin of the same name.
    repr_tokens = to_token_list(
        to_repr(PrepConfig.from_encoded_string(prepconfig), token_list,
                NgramSplitConfig())).split(' ')
    return lang_checker.calc_lang_stats(repr_tokens, include_sample=include_sample)
def test_both_enonly_and_nosplit(self):
    """EN_ONLY marking combined with SPLIT=0 must raise ValueError."""
    # NOTE(review): the config construction sits inside the assertRaises
    # context on purpose — cannot tell from here whether PrepConfig or
    # to_repr performs the validation; behavior is preserved either way.
    with self.assertRaises(ValueError):
        prep_config = PrepConfig({
            PrepParam.EN_ONLY: 1,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 0,
            PrepParam.TABS_NEWLINES: 1,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 1,
        })
        to_repr(prep_config, [], NgramSplitConfig())
def test_to_repr_0(self):
    """SPLIT=0 and CAPS=0 keep every token whole and case-preserving."""
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 0,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 0,
        PrepParam.TABS_NEWLINES: 1,
        PrepParam.MARK_LOGS: 1,
        PrepParam.CAPS: 0,
    })

    actual = to_repr(prep_config, tokens, NgramSplitConfig())

    expected = [
        '1.1',
        '*',
        'dinero',
        '"', 'AWirklich', '"',
        '/*', 'ц', 'blanco_english', '*/',
        '//', 'DIESELBE8', pl['olc_end'],
    ]
    self.assertEqual(expected, actual)
def test_to_repr_1_nosep(self):
    """SPLIT=1: camel/underscore splitting only, numbers stay whole."""
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 1,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 1,
        PrepParam.TABS_NEWLINES: 1,
        PrepParam.MARK_LOGS: 1,
        PrepParam.CAPS: 1,
    })

    actual = to_repr(prep_config, tokens, NgramSplitConfig())

    expected = [
        '1.1',
        '*',
        pl['non_eng'],
        '"', pl['word_start'], pl['capitals'], 'a', pl['capital'],
        pl['non_eng'], pl['word_end'], '"',
        '/*', pl['non_eng'],
        pl['word_start'], pl['non_eng'], '_', 'english', pl['word_end'],
        '*/',
        '//', pl['word_start'], pl['capitals'], pl['non_eng'], '8',
        pl['word_end'], pl['olc_end'],
    ]
    self.assertEqual(expected, actual)
def test_to_repr_no_str_no_com(self):
    """COM_STR=2 replaces string literals and comments with placeholders."""
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 1,
        PrepParam.COM_STR: 2,
        PrepParam.SPLIT: 3,
        PrepParam.TABS_NEWLINES: 1,
        PrepParam.MARK_LOGS: 1,
        PrepParam.CAPS: 1,
    })
    split_config = NgramSplitConfig(
        splitting_type=NgramSplittingType.NUMBERS_AND_CUSTOM,
        sc_splittings={'english': ['engl', 'ish']},
    )

    actual = to_repr(prep_config, tokens, split_config)

    expected = [
        pl['word_start'], '1', '.', '1', pl['word_end'],
        '*',
        pl['non_eng'],
        pl['string_literal'],
        pl['comment'],
        pl['comment'],
    ]
    self.assertEqual(expected, actual)
def test_1(self):
    """A cached BPE splitting is used verbatim, bypassing merge application."""
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 0,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 4,
        PrepParam.TABS_NEWLINES: 0,
        PrepParam.MARK_LOGS: 1,
        PrepParam.CAPS: 1,
    })
    # Cache already maps 'while' to itself, so the word is not split.
    split_config = NgramSplitConfig(
        splitting_type=NgramSplittingType.BPE,
        merges_cache={'while': ['while']},
    )
    input_tokens = [SplitContainer.from_single_token("While")]

    actual = to_repr(prep_config, input_tokens, split_config)

    expected = [pl['capital'], 'while']
    self.assertEqual(expected, actual)
def to_repr_l(lst):
    """Shorthand: run to_repr on *lst* with the fixed '000010' prep config."""
    config = PrepConfig.from_encoded_string('000010')
    return to_repr(config, lst, NgramSplitConfig())
def init_splitting_config(dataset: str, prep_config: PrepConfig,
                          bpe_base_repr: Optional[str],
                          bpe_n_merges: Optional[int],
                          splitting_file: Optional[str],
                          merges_file):
    """Initialize the module-level splitting configuration from CLI-style options.

    Mutates the global ``global_n_gramm_splitting_config`` in place:
    for SPLIT values 4-9 it loads BPE merges (either from an explicit
    merges file or from a precomputed base-repr directory); for SPLIT=3
    it loads custom splittings from *splitting_file*; for SPLIT=2 it
    enables numbers-only splitting. Raises ValueError when a required
    option for the selected mode is missing.
    """
    global global_n_gramm_splitting_config
    global_n_gramm_splitting_config = NgramSplitConfig()
    # SPLIT 4-9 are all BPE variants (differing in number of merges).
    if prep_config.get_param_value(PrepParam.SPLIT) in [4, 5, 6, 7, 8, 9]:
        if merges_file:
            # Explicit merges file: no precomputed cache is available.
            logger.info(f'Using bpe merges file: {merges_file}')
            # NOTE(review): cache is set to [] here but to {} elsewhere in
            # this file — works for `in`-style lookups, but confirm intent.
            global_n_gramm_splitting_config.merges_cache = []
            global_n_gramm_splitting_config.merges = read_merges(merges_file, bpe_n_merges)
            if bpe_n_merges:
                logger.info(f'Using first {bpe_n_merges} merges.')
        else:
            # Derive merges location from a base BPE prep config.
            if not bpe_base_repr:
                bpe_base_repr = prep_config.get_base_bpe_prep_config()
            if prep_config.get_param_value(PrepParam.SPLIT) == 9:
                # SPLIT=9 has no default merge count; it must be given.
                if not bpe_n_merges:
                    raise ValueError("--bpe-n-merges must be specified for repr **9**")
            else:
                # Fixed merge counts for the standard BPE repr levels.
                bpe_n_merges_dict = {4: 5000, 5: 1000, 6: 10000, 7: 20000, 8: 0}
                bpe_n_merges = bpe_n_merges_dict[prep_config.get_param_value(PrepParam.SPLIT)]
            # "dataset/repr" syntax lets the base repr live in another dataset.
            if bpe_base_repr.find("/") == -1:
                bpe_base_dataset = dataset
            else:
                bpe_base_dataset, bpe_base_repr = bpe_base_repr.split("/")
            logger.info(f'Using bpe base dataset: {bpe_base_dataset}')
            logger.info(f'Using bpe base repr: {bpe_base_repr}')
            logger.info(f'Using bpe_n_merges: {bpe_n_merges}')
            path_to_merges_dir = os.path.join(DEFAULT_PARSED_DATASETS_DIR, bpe_base_dataset,
                                              METADATA_DIR, bpe_base_repr, BPE_DIR,
                                              str(bpe_n_merges))
            bpe_merges_file = os.path.join(path_to_merges_dir, 'merges.txt')
            bpe_merges_cache = os.path.join(path_to_merges_dir, 'merges_cache.txt')
            global_n_gramm_splitting_config.merges_cache = read_dict_from_2_columns(bpe_merges_cache, val_type=list)
            global_n_gramm_splitting_config.merges = read_merges(bpe_merges_file)
        global_n_gramm_splitting_config.set_splitting_type(NgramSplittingType.BPE)
    elif prep_config.get_param_value(PrepParam.SPLIT) == 3:
        # Custom subword splittings read from a user-provided file.
        if not splitting_file:
            raise ValueError("--splitting-file must be specified")
        splittings = read_dict_from_2_columns(splitting_file, val_type=list,
                                              delim='|')
        global_n_gramm_splitting_config.sc_splittings = splittings
        global_n_gramm_splitting_config.set_splitting_type(NgramSplittingType.NUMBERS_AND_CUSTOM)
    elif prep_config.get_param_value(PrepParam.SPLIT) == 2:
        global_n_gramm_splitting_config.set_splitting_type(NgramSplittingType.ONLY_NUMBERS)
"0.345e+4": ["0.", "3", "4", "5", "e+", "4"], "modified": ["mod", "if", "ied"], "create": ["create"], "vector": ["vector"], "best": ["best"], "test": ["test"], "num": ["num"], "user": ["user"], "get": ["get"], "nick": ["ni", "ck"], "logger": ["logger"], "info": ["info"] } ngram_split_config = NgramSplitConfig(NgramSplittingType.BPE, merges_cache=bpe_merges_cache, merges={}) class SubwordSeparation(unittest.TestCase): def test(self): for input, output_tuple in test_cases.items(): parsed = apply_preprocessors(from_string(input), pp_params["preprocessors"], {}) self.assertEqual(output_tuple[0], parsed) repred = to_repr(PrepConfig.from_encoded_string('104111'), parsed, ngram_split_config) self.assertEqual(output_tuple[1], repred) if __name__ == '__main__':