Exemplo n.º 1
0
    def test_to_repr_2_nosep(self):
        """Check ``to_repr`` for SPLIT=2 with ONLY_NUMBERS n-gram splitting.

        The input is the ``tokens`` name resolved from the enclosing scope
        (it is not defined inside this method).
        """
        params = {
            PrepParam.EN_ONLY: 1,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 2,
            PrepParam.TABS_NEWLINES: 1,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 1,
        }
        prep_config = PrepConfig(params)
        split_config = NgramSplitConfig(
            splitting_type=NgramSplittingType.ONLY_NUMBERS)

        actual = to_repr(prep_config, tokens, split_config)

        expected = [
            pl['word_start'], '1', '.', '1', pl['word_end'],
            '*',
            pl['non_eng'],
            '"',
            pl['word_start'], pl['capitals'], 'a', pl['capital'],
            pl['non_eng'], pl['word_end'],
            '"',
            '/*',
            pl['non_eng'],
            pl['word_start'], pl['non_eng'], '_', 'english', pl['word_end'],
            '*/',
            '//',
            pl['word_start'], pl['capitals'], pl['non_eng'], '8',
            pl['word_end'],
            pl['olc_end'],
        ]

        self.assertEqual(expected, actual)
Exemplo n.º 2
0
    def test_merges_no_cache(self):
        """Check BPE splitting of "While" with one merge and an empty cache.

        The only merge supplied is ('w', 'h'); ``merges_cache`` is an empty
        dict, so the expected output has "wh" merged and the remaining
        letters kept separate.
        """
        params = {
            PrepParam.EN_ONLY: 0,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 4,
            PrepParam.TABS_NEWLINES: 0,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 1,
        }
        prep_config = PrepConfig(params)

        bpe_config = NgramSplitConfig(
            splitting_type=NgramSplittingType.BPE,
            merges={('w', 'h'): 0},
            merges_cache={},
        )

        single_word = [SplitContainer.from_single_token("While")]

        actual = to_repr(prep_config, single_word, bpe_config)

        expected = [
            pl['word_start'],
            pl['capital'],
            'wh', 'i', 'l', 'e',
            pl['word_end'],
        ]

        self.assertEqual(expected, actual)
Exemplo n.º 3
0
    def test_to_repr_no_no_sep_with_bpe_no_merges(self):
        """Check ``to_repr`` for SPLIT=4 (BPE) when no merges are supplied.

        With an empty ``merges`` and an empty ``merges_cache`` the expected
        output spells "english" one character at a time. The input is the
        ``tokens`` name resolved from the enclosing scope.
        """
        params = {
            PrepParam.EN_ONLY: 1,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 4,
            PrepParam.TABS_NEWLINES: 1,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 1,
        }
        prep_config = PrepConfig(params)

        bpe_config = NgramSplitConfig(
            splitting_type=NgramSplittingType.BPE,
            merges=[],
            merges_cache={},
        )

        actual = to_repr(prep_config, tokens, bpe_config)

        expected = [
            pl['word_start'], '1', '.', '1', pl['word_end'],
            '*',
            pl['non_eng'],
            '"',
            pl['word_start'], pl['capitals'], 'a', pl['capital'],
            pl['non_eng'], pl['word_end'],
            '"',
            '/*',
            pl['non_eng'],
            pl['word_start'], pl['non_eng'], '_',
            'e', 'n', 'g', 'l', 'i', 's', 'h',
            pl['word_end'],
            '*/',
            '//',
            pl['word_start'], pl['capitals'], pl['non_eng'], '8',
            pl['word_end'],
            pl['olc_end'],
        ]

        self.assertEqual(expected, actual)
Exemplo n.º 4
0
    def test_log_no_mark_logs(self):
        """Check ``to_repr`` on a ``LogStatement`` when MARK_LOGS is 0.

        The expected output is the plain token sequence of the logging call
        (object name, '.', level name, parenthesised string literal, ';').
        """
        params = {
            PrepParam.EN_ONLY: 1,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 1,
            PrepParam.TABS_NEWLINES: 0,
            PrepParam.MARK_LOGS: 0,
            PrepParam.CAPS: 1,
        }
        prep_config = PrepConfig(params)

        default_split_config = NgramSplitConfig()

        # Build the LogStatement arguments in the same order the original
        # nested expression evaluated them.
        logger_name = SplitContainer.from_single_token('LOGGER')
        level_name = SplitContainer.from_single_token('Info')
        message = [StringLiteral([SplitContainer.from_single_token("Hi")])]
        log_tokens = [LogStatement(logger_name, level_name, INFO, message)]

        actual = to_repr(prep_config, log_tokens, default_split_config)

        expected = [
            pl['capitals'], 'logger',
            '.',
            pl['capital'], 'info',
            '(',
            '"', pl['capital'], 'hi', '"',
            ')',
            ';',
        ]

        self.assertEqual(expected, actual)
Exemplo n.º 5
0
    def test_to_repr_with_non_eng(self):
        """Check ``to_repr`` for EN_ONLY=0 with custom sub-word splittings.

        SPLIT=3 is paired with NUMBERS_AND_CUSTOM splitting and explicit
        splittings for 'english' and 'dieselbe'. The input is the ``tokens``
        name resolved from the enclosing scope.
        """
        params = {
            PrepParam.EN_ONLY: 0,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 3,
            PrepParam.TABS_NEWLINES: 1,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 1,
        }
        prep_config = PrepConfig(params)

        custom_splittings = {
            'english': ['engl', 'ish'],
            'dieselbe': ['die', 'selbe'],
        }
        split_config = NgramSplitConfig(
            splitting_type=NgramSplittingType.NUMBERS_AND_CUSTOM,
            sc_splittings=custom_splittings)

        actual = to_repr(prep_config, tokens, split_config)

        expected = [
            pl['word_start'], '1', '.', '1', pl['word_end'],
            '*',
            'dinero',
            '"',
            pl['word_start'], pl['capitals'], 'a', pl['capital'], 'wirklich',
            pl['word_end'],
            '"',
            '/*',
            'ц',
            pl['word_start'], 'blanco', '_', 'engl', 'ish', pl['word_end'],
            '*/',
            '//',
            pl['word_start'], pl['capitals'], 'die', 'selbe', '8',
            pl['word_end'],
            pl['olc_end'],
        ]

        self.assertEqual(expected, actual)
Exemplo n.º 6
0
    def test_to_repr_with_enonlycontents(self):
        """Check ``to_repr`` for EN_ONLY=2 on a locally built token list.

        The string literal consists solely of ``NonEng`` words; the expected
        output represents its content by the single ``non_eng_content``
        placeholder between the quotes.
        """
        params = {
            PrepParam.EN_ONLY: 2,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 3,
            PrepParam.TABS_NEWLINES: 1,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 1,
        }
        prep_config = PrepConfig(params)

        split_config = NgramSplitConfig(
            splitting_type=NgramSplittingType.NUMBERS_AND_CUSTOM,
            sc_splittings={})

        # Words of the string literal, each wrapped in NonEng below.
        literal_words = (
            "ich", "weiss", "nicht", "was", "soll", "es",
            "bedeuten", "dass", "ich", "so", "traurig", "bin",
        )

        input_tokens = [
            Number([1, DecimalPoint(), 1]),
            "*",
            SplitContainer([NonEng(Word.from_("dinero"))]),
            StringLiteral([NonEng(Word.from_(word)) for word in literal_words]),
            NewLine(),
            MultilineComment([
                SplitContainer([NonEng(Word.from_('ц'))]),
                SplitContainer([
                    NonEng(Word.from_("blanco")),
                    Underscore(),
                    Word.from_("english"),
                ]),
            ]),
            NewLine(),
            Tab(),
            OneLineComment([
                SplitContainer([
                    NonEng(Word.from_("DIESELBE")),
                    Word.from_("8"),
                ]),
            ]),
        ]

        actual = to_repr(prep_config, input_tokens, split_config)

        expected = [
            pl['word_start'], '1', '.', '1', pl['word_end'],
            '*',
            pl['non_eng'],
            '"', pl['non_eng_content'], '"',
            '/*',
            pl['non_eng'],
            pl['word_start'], pl['non_eng'], '_', 'english', pl['word_end'],
            '*/',
            '//',
            pl['word_start'], pl['capitals'], pl['non_eng'], '8',
            pl['word_end'],
            pl['olc_end'],
        ]

        self.assertEqual(expected, actual)
Exemplo n.º 7
0
def calc_stats_for_prepconfig(prepconfig,
                              lang_checker,
                              token_list,
                              include_sample=False):
    """Compute language statistics for ``token_list`` under ``prepconfig``.

    ``prepconfig`` is decoded with ``PrepConfig.from_encoded_string``, the
    tokens are converted with ``to_repr`` using a default
    ``NgramSplitConfig``, joined by ``to_token_list`` and split on single
    spaces. The resulting words are passed to
    ``lang_checker.calc_lang_stats`` and its result is returned.

    :param prepconfig: encoded preprocessing configuration string.
    :param lang_checker: object providing ``calc_lang_stats``.
    :param token_list: parsed tokens to convert.
    :param include_sample: forwarded unchanged to ``calc_lang_stats``.
    """
    decoded_config = PrepConfig.from_encoded_string(prepconfig)
    represented = to_repr(decoded_config, token_list, NgramSplitConfig())
    words = to_token_list(represented).split(' ')
    return lang_checker.calc_lang_stats(words, include_sample=include_sample)
Exemplo n.º 8
0
 def test_both_enonly_and_nosplit(self):
     """Expect ``ValueError`` when EN_ONLY=1 is combined with SPLIT=0.

     Both the ``PrepConfig`` construction and the ``to_repr`` call stay
     inside the ``assertRaises`` block, so the error may come from either.
     """
     conflicting_params = {
         PrepParam.EN_ONLY: 1,
         PrepParam.COM_STR: 0,
         PrepParam.SPLIT: 0,
         PrepParam.TABS_NEWLINES: 1,
         PrepParam.MARK_LOGS: 1,
         PrepParam.CAPS: 1,
     }
     with self.assertRaises(ValueError):
         to_repr(PrepConfig(conflicting_params), [], NgramSplitConfig())
Exemplo n.º 9
0
    def test_to_repr_0(self):
        """Check ``to_repr`` for SPLIT=0 and CAPS=0 with default splitting.

        The expected output keeps each word and number as a single,
        case-preserved string. The input is the ``tokens`` name resolved from
        the enclosing scope.
        """
        params = {
            PrepParam.EN_ONLY: 0,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 0,
            PrepParam.TABS_NEWLINES: 1,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 0,
        }
        prep_config = PrepConfig(params)

        actual = to_repr(prep_config, tokens, NgramSplitConfig())

        expected = [
            '1.1',
            '*',
            'dinero',
            '"', 'AWirklich', '"',
            '/*', 'ц', 'blanco_english', '*/',
            '//', 'DIESELBE8', pl['olc_end'],
        ]

        self.assertEqual(expected, actual)
Exemplo n.º 10
0
    def test_to_repr_1_nosep(self):
        """Check ``to_repr`` for SPLIT=1 with EN_ONLY=1 and default splitting.

        In the expected output the number stays the single string '1.1'.
        The input is the ``tokens`` name resolved from the enclosing scope.
        """
        params = {
            PrepParam.EN_ONLY: 1,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 1,
            PrepParam.TABS_NEWLINES: 1,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 1,
        }
        prep_config = PrepConfig(params)

        actual = to_repr(prep_config, tokens, NgramSplitConfig())

        expected = [
            '1.1',
            '*',
            pl['non_eng'],
            '"',
            pl['word_start'], pl['capitals'], 'a', pl['capital'],
            pl['non_eng'], pl['word_end'],
            '"',
            '/*',
            pl['non_eng'],
            pl['word_start'], pl['non_eng'], '_', 'english', pl['word_end'],
            '*/',
            '//',
            pl['word_start'], pl['capitals'], pl['non_eng'], '8',
            pl['word_end'],
            pl['olc_end'],
        ]

        self.assertEqual(expected, actual)
Exemplo n.º 11
0
    def test_to_repr_no_str_no_com(self):
        """Check ``to_repr`` for COM_STR=2.

        The expected output contains the ``string_literal`` and ``comment``
        placeholders instead of literal and comment contents. The input is
        the ``tokens`` name resolved from the enclosing scope.
        """
        params = {
            PrepParam.EN_ONLY: 1,
            PrepParam.COM_STR: 2,
            PrepParam.SPLIT: 3,
            PrepParam.TABS_NEWLINES: 1,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 1,
        }
        prep_config = PrepConfig(params)

        split_config = NgramSplitConfig(
            splitting_type=NgramSplittingType.NUMBERS_AND_CUSTOM,
            sc_splittings={'english': ['engl', 'ish']})

        actual = to_repr(prep_config, tokens, split_config)

        expected = [
            pl['word_start'], '1', '.', '1', pl['word_end'],
            '*',
            pl['non_eng'],
            pl['string_literal'],
            pl['comment'],
            pl['comment'],
        ]

        self.assertEqual(expected, actual)
Exemplo n.º 12
0
    def test_1(self):
        """Check BPE splitting of "While" when the cache already holds 'while'.

        ``merges_cache`` maps 'while' to the single piece ['while']; the
        expected output is the capital marker followed by that piece.
        """
        params = {
            PrepParam.EN_ONLY: 0,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 4,
            PrepParam.TABS_NEWLINES: 0,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 1,
        }
        prep_config = PrepConfig(params)

        cached_config = NgramSplitConfig(
            splitting_type=NgramSplittingType.BPE,
            merges_cache={'while': ['while']})

        single_word = [SplitContainer.from_single_token("While")]

        actual = to_repr(prep_config, single_word, cached_config)

        expected = [pl['capital'], 'while']

        self.assertEqual(expected, actual)
Exemplo n.º 13
0
def to_repr_l(lst):
    """Run ``to_repr`` on ``lst`` with the fixed encoded config '000010'.

    A default ``NgramSplitConfig`` is used for n-gram splitting.
    """
    fixed_config = PrepConfig.from_encoded_string('000010')
    return to_repr(fixed_config, lst, NgramSplitConfig())
Exemplo n.º 14
0
def init_splitting_config(dataset: str, prep_config: PrepConfig,
                          bpe_base_repr: Optional[str], bpe_n_merges: Optional[int], splitting_file: Optional[str], merges_file):
    """Rebuild the module-level ``global_n_gramm_splitting_config``.

    The global is always replaced by a fresh ``NgramSplitConfig`` and then
    filled in according to the SPLIT parameter of ``prep_config``:

    * 4-9: BPE splitting. If ``merges_file`` is truthy, merges are read from
      it (limited by ``bpe_n_merges``) and the cache starts empty. Otherwise
      merges and their cache are read from a directory derived from the base
      dataset, the base repr and the number of merges.
    * 3: custom splitting read from ``splitting_file``.
    * 2: numbers-only splitting.
    * any other value: the default ``NgramSplitConfig`` is left untouched.

    :param dataset: dataset name, used as the BPE base dataset when
        ``bpe_base_repr`` contains no "/".
    :param prep_config: preprocessing configuration whose SPLIT value
        selects the branch.
    :param bpe_base_repr: base repr, optionally in "<dataset>/<repr>" form;
        when falsy, ``prep_config.get_base_bpe_prep_config()`` is used.
    :param bpe_n_merges: number of merges. Without ``merges_file`` it is only
        honoured for SPLIT == 9; for SPLIT 4-8 a fixed per-value number
        replaces it.
    :param splitting_file: file with custom splittings (required for
        SPLIT == 3).
    :param merges_file: explicit merges file; takes precedence over the
        derived directory when truthy.
    :raises ValueError: if SPLIT == 9 without ``merges_file`` and without
        ``bpe_n_merges``, or if SPLIT == 3 without ``splitting_file``.
    """
    global global_n_gramm_splitting_config
    global_n_gramm_splitting_config = NgramSplitConfig()
    split_value = prep_config.get_param_value(PrepParam.SPLIT)
    if split_value in [4, 5, 6, 7, 8, 9]:
        if merges_file:
            logger.info(f'Using bpe merges file: {merges_file}')
            # The cache is a word -> pieces mapping everywhere else (see the
            # read_dict_from_2_columns call below), so start from an empty
            # dict rather than a list.
            global_n_gramm_splitting_config.merges_cache = {}
            global_n_gramm_splitting_config.merges = read_merges(merges_file, bpe_n_merges)
            if bpe_n_merges:
                logger.info(f'Using first {bpe_n_merges} merges.')
        else:
            if not bpe_base_repr:
                bpe_base_repr = prep_config.get_base_bpe_prep_config()

            if split_value == 9:
                if not bpe_n_merges:
                    raise ValueError("--bpe-n-merges must be specified for repr **9**")
            else:
                # For SPLIT 4-8 the number of merges is fixed by the SPLIT
                # value and overrides whatever the caller passed in.
                bpe_n_merges_dict = {4: 5000, 5: 1000, 6: 10000, 7: 20000, 8: 0}
                bpe_n_merges = bpe_n_merges_dict[split_value]

            if "/" not in bpe_base_repr:
                bpe_base_dataset = dataset
            else:
                bpe_base_dataset, bpe_base_repr = bpe_base_repr.split("/")
            logger.info(f'Using bpe base dataset: {bpe_base_dataset}')
            logger.info(f'Using bpe base repr: {bpe_base_repr}')
            logger.info(f'Using bpe_n_merges: {bpe_n_merges}')
            path_to_merges_dir = os.path.join(DEFAULT_PARSED_DATASETS_DIR, bpe_base_dataset, METADATA_DIR, bpe_base_repr,
                                              BPE_DIR,
                                              str(bpe_n_merges))
            bpe_merges_file = os.path.join(path_to_merges_dir, 'merges.txt')
            bpe_merges_cache = os.path.join(path_to_merges_dir, 'merges_cache.txt')

            global_n_gramm_splitting_config.merges_cache = read_dict_from_2_columns(bpe_merges_cache, val_type=list)
            global_n_gramm_splitting_config.merges = read_merges(bpe_merges_file)
        global_n_gramm_splitting_config.set_splitting_type(NgramSplittingType.BPE)
    elif split_value == 3:
        if not splitting_file:
            raise ValueError("--splitting-file must be specified")

        splittings = read_dict_from_2_columns(splitting_file, val_type=list, delim='|')
        global_n_gramm_splitting_config.sc_splittings = splittings
        global_n_gramm_splitting_config.set_splitting_type(NgramSplittingType.NUMBERS_AND_CUSTOM)
    elif split_value == 2:
        global_n_gramm_splitting_config.set_splitting_type(NgramSplittingType.ONLY_NUMBERS)
Exemplo n.º 15
0
    "0.345e+4": ["0.", "3", "4", "5", "e+", "4"],
    "modified": ["mod", "if", "ied"],

    "create": ["create"],
    "vector": ["vector"],
    "best": ["best"],
    "test": ["test"],
    "num": ["num"],
    "user": ["user"],
    "get": ["get"],
    "nick": ["ni", "ck"],
    "logger": ["logger"],
    "info": ["info"]
}

# BPE splitting configuration used by the test below: no merges, with
# bpe_merges_cache supplied as the merges cache.
ngram_split_config = NgramSplitConfig(
    NgramSplittingType.BPE,
    merges={},
    merges_cache=bpe_merges_cache,
)


class SubwordSeparation(unittest.TestCase):
    """Table-driven check of parsing and ``to_repr`` over ``test_cases``."""

    def test(self):
        """Verify both pipeline stages for every entry of ``test_cases``.

        Each key is an input string; element 0 of its value is compared with
        the output of ``apply_preprocessors`` and element 1 with the output
        of ``to_repr`` under the encoded config '104111'.
        """
        for source_text, expected_outputs in test_cases.items():
            # subTest names the failing input in the report and lets the
            # remaining cases run after a failure.
            with self.subTest(input=source_text):
                parsed = apply_preprocessors(from_string(source_text), pp_params["preprocessors"], {})

                self.assertEqual(expected_outputs[0], parsed)

                repred = to_repr(PrepConfig.from_encoded_string('104111'), parsed, ngram_split_config)

                self.assertEqual(expected_outputs[1], repred)


if __name__ == '__main__':