def test_2(self):
    """Run an int-array declaration through ``__test_apply_preprocessors``.

    The declaration mixes plain hex, mixed-case hex, decimal, negative
    decimal and negative hex initializers; the expected token list is
    handed to the helper together with the source text.
    """
    # NOTE(review): the literal is reproduced exactly as it appears in this
    # view; the leading/trailing NewLine() tokens expected below suggest it
    # originally spanned several lines -- confirm against the real file.
    text = ''' int[] _my_favoRite_ints_ = {0x12, 0x1fE, 441, -81, -0xfFf}; '''

    identifier = SplitContainer([
        Underscore(),
        Word.from_('my'),
        Underscore(),
        NonEng(Word.from_('favo')),
        Word.from_('Rite'),
        Underscore(),
        Word.from_('ints'),
        Underscore(),
    ])
    declaration = [
        NewLine(),
        SplitContainer.from_single_token('int'),
        '[',
        ']',
        identifier,
        '=',
    ]
    initializer = [
        '{',
        Number([HexStart(), '1', '2']),
        ',',
        Number([HexStart(), '1', 'f', 'E']),
        ',',
        Number(['4', '4', '1']),
        ',',
        Number(['-', '8', '1']),
        ',',
        Number(['-', HexStart(), 'f', 'F', 'f']),
        '}',
        ';',
    ]
    expected_result = declaration + initializer + [NewLine(), NewLine()]

    self.__test_apply_preprocessors(text, expected_result)
def test_7(self):
    """Run a multi-line comment followed by ``/`` and an identifier
    through ``__test_apply_preprocessors``.
    """
    # NOTE(review): the literal is reproduced exactly as it appears in this
    # view; the NewLine() tokens expected below suggest it originally
    # spanned several lines -- confirm against the real file.
    text = ''' /*multi-line MyComment_ *// _operations '''

    comment = MultilineComment([
        SplitContainer.from_single_token('multi'),
        '-',
        SplitContainer.from_single_token('line'),
        SplitContainer([
            Word.from_('My'),
            Word.from_('Comment'),
            Underscore(),
        ]),
        NewLine(),
    ])
    trailing_identifier = SplitContainer([
        Underscore(),
        Word.from_('operations'),
    ])
    expected_result = [
        NewLine(),
        comment,
        '/',
        NewLine(),
        trailing_identifier,
        NewLine(),
        NewLine(),
    ]

    self.__test_apply_preprocessors(text, expected_result)
def test_with_numbers_split(self):
    """``simple_split`` on a string literal wrapping the parseable token
    ``_test_my123GmyClass_`` must yield the sub-word split listed below.
    """
    literal_tokens = [
        StringLiteral([":", ParseableToken("_test_my123GmyClass_")]),
    ]
    split_identifier = SplitContainer([
        Underscore(),
        Word.from_("test"),
        Underscore(),
        Word.from_("my"),
        Word.from_("123"),
        Word.from_("Gmy"),
        Word.from_("Class"),
        Underscore(),
    ])
    expected = [StringLiteral([":", split_identifier])]

    actual = simple_split(literal_tokens, {})

    self.assertEqual(actual, expected)
def test_mark_with_noneng(self):
    """``mark`` must wrap the words listed below in ``NonEng`` inside a
    string literal, a multi-line comment and a one-line comment.
    """
    unmarked = [
        StringLiteral([
            SplitContainer([Word.from_("A"), Word.from_("Wirklich")]),
        ]),
        MultilineComment([
            SplitContainer.from_single_token('ц'),
            SplitContainer([
                Word.from_("blanco"),
                Underscore(),
                Word.from_("english"),
            ]),
        ]),
        OneLineComment([
            SplitContainer([Word.from_("DIESELBE"), Word.from_("8")]),
        ]),
    ]

    marked_literal = StringLiteral([
        SplitContainer([Word.from_("A"), NonEng(Word.from_("Wirklich"))]),
    ])
    # Original author's note (applies to the two containers below): the
    # constructor has to be called manually because the split container
    # cannot set the wordStart prefix when the first subword is wrapped
    # in NonEng.
    marked_multiline = MultilineComment([
        SplitContainer([NonEng(Word.from_('ц'))]),
        SplitContainer([
            NonEng(Word.from_("blanco")),
            Underscore(),
            Word.from_("english"),
        ]),
    ])
    marked_one_line = OneLineComment([
        SplitContainer([
            NonEng(Word.from_("DIESELBE")),
            Word.from_("8"),
        ]),
    ])
    expected = [marked_literal, marked_multiline, marked_one_line]

    actual = mark(unmarked, {})

    self.assertEqual(expected, actual)
def test_to_repr_with_enonlycontents(self):
    """``to_repr`` with ``EN_ONLY`` set to 2: compare the produced
    representation with the placeholder sequence listed below.
    """
    prep_config = PrepConfig({
        PrepParam.EN_ONLY: 2,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 3,
        PrepParam.TABS_NEWLINES: 1,
        PrepParam.MARK_LOGS: 1,
        PrepParam.CAPS: 1,
    })
    ngram_config = NgramSplitConfig(
        splitting_type=NgramSplittingType.NUMBERS_AND_CUSTOM,
        sc_splittings={},
    )

    german_words = (
        "ich", "weiss", "nicht", "was", "soll", "es",
        "bedeuten", "dass", "ich", "so", "traurig", "bin",
    )
    string_literal = StringLiteral(
        [NonEng(Word.from_(word)) for word in german_words]
    )
    multiline_comment = MultilineComment([
        SplitContainer([NonEng(Word.from_('ц'))]),
        SplitContainer([
            NonEng(Word.from_("blanco")),
            Underscore(),
            Word.from_("english"),
        ]),
    ])
    one_line_comment = OneLineComment([
        SplitContainer([NonEng(Word.from_("DIESELBE")), Word.from_("8")]),
    ])
    tokens = [
        Number([1, DecimalPoint(), 1]),
        "*",
        SplitContainer([NonEng(Word.from_("dinero"))]),
        string_literal,
        NewLine(),
        multiline_comment,
        NewLine(),
        Tab(),
        one_line_comment,
    ]

    # Expected output, grouped by the input token each segment belongs to.
    number_repr = [pl['word_start'], '1', '.', '1', pl['word_end']]
    operator_and_identifier_repr = ["*", pl['non_eng']]
    string_repr = ['"', pl["non_eng_content"], '"']
    multiline_repr = [
        '/*',
        pl['non_eng'],
        pl['word_start'], pl['non_eng'], '_', 'english', pl['word_end'],
        '*/',
    ]
    one_line_repr = [
        '//',
        pl['word_start'], pl['capitals'], pl['non_eng'], "8", pl['word_end'],
        pl['olc_end'],
    ]
    expected = (
        number_repr
        + operator_and_identifier_repr
        + string_repr
        + multiline_repr
        + one_line_repr
    )

    actual = to_repr(prep_config, tokens, ngram_config)

    self.assertEqual(expected, actual)
def test_capitals(self):
    """Run camel-case, capitalized, all-caps and upper-snake-case
    identifiers through ``__test_apply_preprocessors``.
    """
    # NOTE(review): the literal is reproduced exactly as it appears in this
    # view; the NewLine() tokens expected below suggest it originally
    # spanned several lines -- confirm against the real file.
    text = ''' MyClass Class CONSTANT VAR_WITH_UNDERSCORES '''

    camel_case = SplitContainer([Word.from_("My"), Word.from_("Class")])
    upper_snake_case = SplitContainer([
        Word.from_("VAR"),
        Underscore(),
        Word.from_("WITH"),
        Underscore(),
        Word.from_("UNDERSCORES"),
    ])
    expected_result = [
        NewLine(),
        camel_case,
        SplitContainer.from_single_token("Class"),
        SplitContainer.from_single_token("CONSTANT"),
        upper_snake_case,
        NewLine(),
        NewLine(),
    ]

    self.__test_apply_preprocessors(text, expected_result)
def test_no_logs(self):
    """``logs.mark`` must return a token list equal to its input when the
    token list below is passed in.
    """
    # Named ``tokens`` rather than ``input`` so the builtin ``input`` is
    # not shadowed inside this test.
    tokens = [
        NewLine(),
        SplitContainer.from_single_token('long'),
        '[',
        ']',
        SplitContainer([
            Word.from_('lovely'),
            Underscore(),
            Word.from_('longs'),
        ]),
        '=',
        '{',
        Number([HexStart(), '3', '4', 'a', '3', '5', 'E', L()]),
    ]

    actual = logs.mark(tokens, None)

    self.assertEqual(tokens, actual)
def simple_split_token(token):
    """Split a token into sub-words, recursing into token containers.

    * A ``ParseableToken`` is converted to its string form and scanned with
      a regular expression that picks out underscores, digit runs,
      optionally-capitalized lowercase runs, and uppercase runs not
      followed by a lowercase letter. Each ``_`` becomes ``Underscore()``,
      every other match becomes ``Word.from_(match)``, and the pieces are
      returned wrapped in a ``SplitContainer``. Characters the pattern does
      not match are not carried over.
    * A ``ProcessableTokenContainer`` is rebuilt as the same type from its
      recursively split sub-tokens.
    * Anything else is returned unchanged.
    """
    if isinstance(token, ParseableToken):
        matches = regex.finditer(
            '(_|[0-9]+|[[:upper:]]?[[:lower:]]+|[[:upper:]]+(?![[:lower:]]))',
            str(token))
        pieces = []
        for match in matches:
            fragment = match[0]
            if fragment == '_':
                pieces.append(Underscore())
            else:
                pieces.append(Word.from_(fragment))
        return SplitContainer(pieces)

    if isinstance(token, ProcessableTokenContainer):
        container_type = type(token)
        split_subtokens = [
            simple_split_token(child) for child in token.get_subtokens()
        ]
        return container_type(split_subtokens)

    return token
def test_1(self):
    """Run a long-array declaration with ``L``-suffixed hex literals
    through ``__test_apply_preprocessors``.
    """
    # NOTE(review): the literal is reproduced exactly as it appears in this
    # view; the NewLine() and Tab() tokens expected below suggest the
    # original contained line breaks and tabs -- confirm against the real
    # file.
    text = ''' long[] lovely_longs = {0x34a35EL, 0x88bc96fl , -0x34L}; '''

    declaration = [
        NewLine(),
        SplitContainer.from_single_token('long'),
        '[',
        ']',
        SplitContainer([
            Word.from_('lovely'),
            Underscore(),
            Word.from_('longs'),
        ]),
        '=',
    ]
    initializer = [
        '{',
        Number([HexStart(), '3', '4', 'a', '3', '5', 'E', L()]),
        ',',
        Tab(),
        Number([HexStart(), '8', '8', 'b', 'c', '9', '6', 'f', L()]),
        Tab(),
        Tab(),
        ',',
        Number(['-', HexStart(), '3', '4', L()]),
        '}',
        ';',
    ]
    expected_result = declaration + initializer + [NewLine(), NewLine()]

    self.__test_apply_preprocessors(text, expected_result)
def test_process_comments_and_str_literals(self):
    """Positive scenario for ``process_comments_and_str_literals``.

    The input token stream corresponds, line by line, to:

        quote, one-line-comment start, test_myClass, quote
        one-line-comment start, multi-line-comment end
        quote, multi-line-comment start, !, quote
        multi-line-comment start
        multi-line-comment end

    and is expected to come back grouped into ``StringLiteral``,
    ``OneLineComment`` and ``MultilineComment`` containers.
    """

    def identifier():
        # A fresh container on every call, so the input and the expected
        # output never share an object.
        return SplitContainer([
            Word.from_("test"),
            Underscore(),
            Word.from_("my"),
            Word.from_("Class"),
        ])

    tokens = [
        # first line: string literal containing a one-line-comment start
        Quote(), OneLineCommentStart(), identifier(), Quote(), NewLine(),
        # second line: one-line comment containing a multi-line-comment end
        OneLineCommentStart(), MultilineCommentEnd(), NewLine(),
        # third line: string literal containing a multi-line-comment start
        Quote(),
        MultilineCommentStart(),
        SplitContainer.from_single_token("!"),
        Quote(),
        NewLine(),
        # last two lines: multi-line comment holding only a newline
        MultilineCommentStart(), NewLine(), MultilineCommentEnd(), NewLine(),
    ]

    first_literal = StringLiteral([OneLineCommentStart(), identifier()])
    second_literal = StringLiteral([
        MultilineCommentStart(),
        SplitContainer.from_single_token("!"),
    ])
    expected = [
        first_literal,
        NewLine(),
        OneLineComment([MultilineCommentEnd()]),
        NewLine(),
        second_literal,
        NewLine(),
        MultilineComment([NewLine()]),
        NewLine(),
    ]

    actual = process_comments_and_str_literals(tokens, {})

    self.assertEqual(expected, actual)
from logrec.dataprep.to_repr import to_repr pl = placeholders tokens = [ Number([1, DecimalPoint(), 1]), "*", SplitContainer([NonEng(Word.from_("dinero"))]), StringLiteral( [SplitContainer([Word.from_("A"), NonEng(Word.from_("Wirklich"))])]), NewLine(), MultilineComment([ SplitContainer([NonEng(Word.from_('ц'))]), SplitContainer([ NonEng(Word.from_("blanco")), Underscore(), Word.from_("english") ]) ]), NewLine(), Tab(), OneLineComment( [SplitContainer([NonEng(Word.from_("DIESELBE")), Word.from_("8")])]) ] class TeprTest(unittest.TestCase): def test_both_enonly_and_nosplit(self): with self.assertRaises(ValueError): prep_config = PrepConfig({
[placeholders["capital"], "vector"], ), "players": ( [SplitContainer.from_single_token("players")], [placeholders["word_start"], 'play', 'er', 's', placeholders["word_end"]] ), "0.345e+4": ( [Number(["0", DecimalPoint(), "3", "4", "5", E(), "+", "4"])], [placeholders["word_start"], "0.", "3", "4", "5", "e+", "4", placeholders["word_end"]] ), "bestPlayers": ( [SplitContainer([Word.from_("best"), Word.from_("Players")])], [placeholders["word_start"], "best", placeholders["capital"], 'play', "er", "s", placeholders["word_end"]] ), "test_BestPlayers": ( [SplitContainer([Word.from_("test"), Underscore(), Word.from_("Best"), Word.from_("Players")])], [placeholders["word_start"], "test", '_', placeholders["capital"], "best", placeholders["capital"], 'play', "er", "s", placeholders["word_end"]] ), "test_BestPlayers_modified": ( [SplitContainer( [Word.from_("test"), Underscore(), Word.from_("Best"), Word.from_("Players"), Underscore(), Word.from_("modified")] )], [placeholders["word_start"], "test", '_', placeholders["capital"], "best", placeholders["capital"], 'play', "er", "s", '_', "mod", "if", "ied", placeholders["word_end"]] ), "N_PLAYERS_NUM": ( [SplitContainer([Word.from_("N"), Underscore(), Word.from_("PLAYERS"), Underscore(), Word.from_("NUM")])],