def test_mark_all_eng(self):
    '''
    Every word in the stream is English, so `mark` is expected to hand
    back a token list equal to the one it was given.
    '''
    string_with_olc_start = StringLiteral([
        OneLineCommentStart(),
        SplitContainer([
            Word.from_("test"),
            Word.from_("my"),
            Word.from_("class"),
        ]),
    ])
    comment_with_mlc_end = OneLineComment([
        MultilineCommentEnd(),
        SplitContainer.from_single_token("lifeisgood"),
    ])
    string_with_mlc_start = StringLiteral([
        MultilineCommentStart(),
        SplitContainer.from_single_token("!"),
    ])
    multiline_comment = MultilineComment([NewLine()])

    tokens = [
        string_with_olc_start,
        NewLine(),
        comment_with_mlc_end,
        NewLine(),
        string_with_mlc_start,
        NewLine(),
        multiline_comment,
        NewLine(),
    ]

    actual = mark(tokens, {})

    # The input list itself serves as the expected value.
    self.assertEqual(actual, tokens)
def test_with_numbers_split(self):
    '''
    `simple_split` should break "_test_my123GmyClass_" at underscores,
    at letter/digit boundaries and at case changes, leaving the
    surrounding string literal and the ":" token in place.
    '''
    unsplit = [
        StringLiteral([":", ParseableToken("_test_my123GmyClass_")]),
    ]

    actual = simple_split(unsplit, {})

    split_identifier = SplitContainer([
        Underscore(),
        Word.from_("test"),
        Underscore(),
        Word.from_("my"),
        Word.from_("123"),
        Word.from_("Gmy"),
        Word.from_("Class"),
        Underscore(),
    ])
    expected = [StringLiteral([":", split_identifier])]

    self.assertEqual(actual, expected)
def test_mark_with_noneng(self):
    '''
    `mark` should wrap the non-English words ("Wirklich", "ц", "blanco",
    "DIESELBE") in NonEng and leave the remaining tokens untouched.
    '''
    source_tokens = [
        StringLiteral([
            SplitContainer([Word.from_("A"), Word.from_("Wirklich")]),
        ]),
        MultilineComment([
            SplitContainer.from_single_token('ц'),
            SplitContainer([
                Word.from_("blanco"),
                Underscore(),
                Word.from_("english"),
            ]),
        ]),
        OneLineComment([
            SplitContainer([Word.from_("DIESELBE"), Word.from_("8")]),
        ]),
    ]

    actual = mark(source_tokens, {})

    marked_string = StringLiteral([
        SplitContainer([Word.from_("A"), NonEng(Word.from_("Wirklich"))]),
    ])
    # Original author's note for the two containers below: the
    # constructor has to be called manually, because the split container
    # cannot set the wordStart prefix when the first subword is wrapped
    # in NonEng.
    marked_blanco = SplitContainer([
        NonEng(Word.from_("blanco")),
        Underscore(),
        Word.from_("english"),
    ])
    marked_multiline = MultilineComment([
        SplitContainer([NonEng(Word.from_('ц'))]),
        marked_blanco,
    ])
    marked_one_line = OneLineComment([
        SplitContainer([
            NonEng(Word.from_("DIESELBE")),
            Word.from_("8"),
        ]),
    ])
    expected = [marked_string, marked_multiline, marked_one_line]

    self.assertEqual(expected, actual)
def test_log_no_mark_logs(self):
    '''
    With MARK_LOGS set to 0, a LogStatement is expected to be rendered
    by `to_repr` as plain lower-cased tokens with capitalisation
    placeholders.
    '''
    single = SplitContainer.from_single_token

    config = PrepConfig({
        PrepParam.EN_ONLY: 1,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 1,
        PrepParam.TABS_NEWLINES: 0,
        PrepParam.MARK_LOGS: 0,
        PrepParam.CAPS: 1,
    })
    ngram_config = NgramSplitConfig()

    log_statement = LogStatement(
        single('LOGGER'),
        single('Info'),
        INFO,
        [StringLiteral([single("Hi")])],
    )

    actual = to_repr(config, [log_statement], ngram_config)

    expected = [
        pl['capitals'], 'logger',
        '.',
        pl['capital'], 'info',
        '(',
        '"', pl['capital'], 'hi', '"',
        ')',
        ';',
    ]
    self.assertEqual(expected, actual)
def test_to_repr_with_enonlycontents(self):
    '''
    With EN_ONLY set to 2, `to_repr` is expected to collapse the
    string literal made only of non-English words into a single
    non_eng_content placeholder and to replace every other non-English
    word with the non_eng placeholder.
    '''
    config = PrepConfig({
        PrepParam.EN_ONLY: 2,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 3,
        PrepParam.TABS_NEWLINES: 1,
        PrepParam.MARK_LOGS: 1,
        PrepParam.CAPS: 1,
    })
    ngram_config = NgramSplitConfig(
        splitting_type=NgramSplittingType.NUMBERS_AND_CUSTOM,
        sc_splittings={},
    )

    german_words = (
        "ich", "weiss", "nicht", "was", "soll", "es",
        "bedeuten", "dass", "ich", "so", "traurig", "bin",
    )
    non_eng_literal = StringLiteral(
        [NonEng(Word.from_(word)) for word in german_words]
    )
    multiline_comment = MultilineComment([
        SplitContainer([NonEng(Word.from_('ц'))]),
        SplitContainer([
            NonEng(Word.from_("blanco")),
            Underscore(),
            Word.from_("english"),
        ]),
    ])
    one_line_comment = OneLineComment([
        SplitContainer([NonEng(Word.from_("DIESELBE")), Word.from_("8")]),
    ])

    tokens = [
        Number([1, DecimalPoint(), 1]),
        "*",
        SplitContainer([NonEng(Word.from_("dinero"))]),
        non_eng_literal,
        NewLine(),
        multiline_comment,
        NewLine(),
        Tab(),
        one_line_comment,
    ]

    actual = to_repr(config, tokens, ngram_config)

    expected = [
        # 1.1
        pl['word_start'], '1', '.', '1', pl['word_end'],
        "*",
        # dinero
        pl['non_eng'],
        # string literal consisting solely of non-English words
        '"', pl["non_eng_content"], '"',
        # multi-line comment
        '/*',
        pl['non_eng'],
        pl['word_start'], pl['non_eng'], '_', 'english', pl['word_end'],
        '*/',
        # one-line comment
        '//',
        pl['word_start'], pl['capitals'], pl['non_eng'], "8",
        pl['word_end'],
        pl['olc_end'],
    ]
    self.assertEqual(expected, actual)
def test_simple_log(self):
    '''
    `log.info("Hi");` should be folded by `logs.mark` into a single
    INFO LogStatement; the tokens around it stay as they are.
    '''
    single = SplitContainer.from_single_token

    def hi_literal():
        # A fresh instance per call, so the input and the expected
        # lists never share objects.
        return StringLiteral([single("Hi")])

    def hex_number():
        return Number([HexStart()] + list('34a35E') + [L()])

    source_tokens = [
        NewLine(),
        single('log'), '.', single('info'),
        '(', hi_literal(), ')', ';',
        hex_number(),
    ]

    actual = logs.mark(source_tokens, None)

    expected = [
        NewLine(),
        LogStatement(single('log'), single('info'), INFO, [hi_literal()]),
        hex_number(),
    ]
    self.assertEqual(expected, actual)
def test_tabs_and_newlines_before_semicolon(self):
    '''
    Newlines and tabs between the closing parenthesis and the semicolon
    are expected to end up in the LogStatement as its fifth argument.
    '''
    single = SplitContainer.from_single_token

    def hi_literal():
        # A fresh instance per call, so the input and the expected
        # lists never share objects.
        return StringLiteral([single("Hi")])

    def hex_number():
        return Number([HexStart()] + list('34a35E') + [L()])

    source_tokens = [
        NewLine(),
        single('log'), '.', single('d'),
        '(', hi_literal(), ')',
        NewLine(), NewLine(), Tab(), Tab(),
        ';',
        hex_number(),
    ]

    actual = logs.mark(source_tokens, None)

    whitespace_before_semicolon = [NewLine(), NewLine(), Tab(), Tab()]
    expected = [
        NewLine(),
        LogStatement(
            single('log'),
            single('d'),
            DEBUG,
            [hi_literal()],
            whitespace_before_semicolon,
        ),
        hex_number(),
    ]
    self.assertEqual(expected, actual)
def test_2_logs(self):
    '''
    Two consecutive log calls should each become their own
    LogStatement: `log.t(...)` with level TRACE and
    `Logger.SEVERE(...)` with level FATAL.
    '''
    single = SplitContainer.from_single_token

    def hi_literal():
        # A fresh instance per call, so the input and the expected
        # lists never share objects.
        return StringLiteral([single("Hi")])

    first_call = [
        single('log'), '.', single('t'),
        '(', hi_literal(), ')', ';',
    ]
    second_call = [
        single('Logger'), '.', single('SEVERE'),
        '(', hi_literal(), ')', ';',
    ]
    source_tokens = [NewLine()] + first_call + [NewLine()] + second_call

    actual = logs.mark(source_tokens, None)

    expected = [
        NewLine(),
        LogStatement(single('log'), single('t'), TRACE, [hi_literal()]),
        NewLine(),
        LogStatement(
            single('Logger'), single('SEVERE'), FATAL, [hi_literal()]
        ),
    ]
    self.assertEqual(expected, actual)
def test_no_dot(self):
    '''
    Without a '.' between the two identifiers the sequence is not a log
    call, so `logs.mark` is expected to return tokens equal to its
    input.
    '''
    single = SplitContainer.from_single_token

    source_tokens = [
        NewLine(),
        single('log'), single('infooooo'),
        '(', StringLiteral([single("Hi")]), ')', ';',
        Number([HexStart()] + list('34a35E') + [L()]),
    ]

    actual = logs.mark(source_tokens, None)

    # The input list itself serves as the expected value.
    self.assertEqual(source_tokens, actual)
def test_4(self):
    # Runs the preprocessors over a one-statement snippet containing a
    # camel-case type name, an identifier with a digit, and a string
    # literal holding an escaped quote, then compares the token stream.
    #
    # NOTE(review): the expected tokens start with NewLine() and end with
    # two NewLine()s, so this literal presumably spanned several lines in
    # the original file and lost its line breaks when this chunk was
    # flattened -- confirm against the real source before relying on it.
    text = ''' BigAWESOMEString[] a2y = "abc".doSplit("\\""); '''
    expected_result = [
        NewLine(),
        SplitContainer([
            Word.from_('Big'),
            Word.from_('AWESOME'),
            Word.from_('String')
        ]),
        '[',
        ']',
        SplitContainer([Word.from_('a'), Word.from_('2'), Word.from_('y')]),
        '=',
        StringLiteral([SplitContainer.from_single_token('abc')]),
        '.',
        SplitContainer([Word.from_('do'), Word.from_('Split')]),
        '(',
        # The "\\"" in the snippet: a backslash followed by a quote.
        StringLiteral([Backslash(), Quote()]),
        ')',
        ';',
        NewLine(),
        NewLine()
    ]
    self.__test_apply_preprocessors(text, expected_result)
def test_nested_data_class(self):
    '''
    Class `A` with a nested class `B`: `loggable.mark` is expected to
    wrap method, constructor and static-initializer bodies in
    LoggableBlock while leaving the class bodies themselves (and the
    leading empty `{ }`) unwrapped.
    '''
    sc = SplitContainer.from_single_token

    source_tokens = [
        '{', '}',
        MultilineComment([sc("class")]),
        sc('import'), sc("a"), NewLine(),
        # class A {
        sc('class'), sc('A'), '{',
        #   void print1() { if (True) {} }
        sc('void'), sc('print1'), '(', ')',
        '{', sc('if'), '(', sc('True'), ')', '{', '}', '}',
        #   static private class B extends D {
        sc('static'), sc('private'), sc('class'), sc('B'),
        sc('extends'), sc('D'), '{',
        #     private String b;
        sc('private'), sc('String'), sc('b'), ';',
        #     B() {}
        sc('B'), '(', ')', '{', '}',
        #     static { c = "class".class }
        sc('static'),
        '{', sc('c'), '=', StringLiteral([sc('class')]), '.',
        sc('class'), '}',
        #   }
        '}',
        #   void print() { if (True) {} }
        sc('void'), sc('print'), '(', ')',
        '{', sc('if'), '(', sc('True'), ')', '{', '}', '}',
        #   int a;
        sc('int'), sc('a'), ';',
        # }
        '}',
    ]

    actual = loggable.mark(source_tokens, None)

    expected = [
        '{', '}',
        MultilineComment([sc("class")]),
        sc('import'), sc("a"), NewLine(),
        # class A {
        sc('class'), sc('A'), '{',
        #   print1 body becomes a loggable block
        sc('void'), sc('print1'), '(', ')',
        LoggableBlock([
            '{', sc('if'), '(', sc('True'), ')', '{', '}', '}',
        ]),
        #   static private class B extends D {
        sc('static'), sc('private'), sc('class'), sc('B'),
        sc('extends'), sc('D'), '{',
        sc('private'), sc('String'), sc('b'), ';',
        #     constructor body
        sc('B'), '(', ')',
        LoggableBlock(['{', '}']),
        #     static initializer
        sc('static'),
        LoggableBlock([
            '{', sc('c'), '=', StringLiteral([sc('class')]), '.',
            sc('class'), '}',
        ]),
        #   }
        '}',
        #   print body becomes a loggable block
        sc('void'), sc('print'), '(', ')',
        LoggableBlock([
            '{', sc('if'), '(', sc('True'), ')', '{', '}', '}',
        ]),
        sc('int'), sc('a'), ';',
        # }
        '}',
    ]

    self.assertEqual(expected, actual)
def test_process_comments_and_str_literals(self):
    '''
    Positive scenario: quote and comment delimiters are folded into
    StringLiteral, OneLineComment and MultilineComment tokens.

    The token list below encodes roughly this input, one line each:

        "//test_myClass"
        //*/
        "/*!"
        /*
        */
    '''
    def identifier():
        # A fresh container per call, so the input and the expected
        # lists never share objects.
        return SplitContainer([
            Word.from_("test"),
            Underscore(),
            Word.from_("my"),
            Word.from_("Class"),
        ])

    bang = SplitContainer.from_single_token

    raw_tokens = [
        # a string literal that contains a one-line comment start
        Quote(), OneLineCommentStart(), identifier(), Quote(), NewLine(),
        # a one-line comment that contains a multi-line comment end
        OneLineCommentStart(), MultilineCommentEnd(), NewLine(),
        # a string literal that contains a multi-line comment start
        Quote(), MultilineCommentStart(), bang("!"), Quote(), NewLine(),
        # a multi-line comment holding a single newline
        MultilineCommentStart(), NewLine(), MultilineCommentEnd(),
        NewLine(),
    ]

    actual = process_comments_and_str_literals(raw_tokens, {})

    expected = [
        StringLiteral([OneLineCommentStart(), identifier()]),
        NewLine(),
        OneLineComment([MultilineCommentEnd()]),
        NewLine(),
        StringLiteral([MultilineCommentStart(), bang("!")]),
        NewLine(),
        MultilineComment([NewLine()]),
        NewLine(),
    ]
    self.assertEqual(expected, actual)
from logrec.dataprep.model.logging import INFO, LogStatement, LoggableBlock from logrec.dataprep.model.noneng import NonEng from logrec.dataprep.model.numeric import DecimalPoint, Number from logrec.dataprep.model.placeholders import placeholders from logrec.dataprep.model.word import Word, Underscore from logrec.dataprep.prepconfig import PrepParam, PrepConfig from logrec.dataprep.split.ngram import NgramSplittingType, NgramSplitConfig from logrec.dataprep.to_repr import to_repr pl = placeholders tokens = [ Number([1, DecimalPoint(), 1]), "*", SplitContainer([NonEng(Word.from_("dinero"))]), StringLiteral( [SplitContainer([Word.from_("A"), NonEng(Word.from_("Wirklich"))])]), NewLine(), MultilineComment([ SplitContainer([NonEng(Word.from_('ц'))]), SplitContainer([ NonEng(Word.from_("blanco")), Underscore(), Word.from_("english") ]) ]), NewLine(), Tab(), OneLineComment( [SplitContainer([NonEng(Word.from_("DIESELBE")), Word.from_("8")])])
[placeholders['non_eng']] ), "_сегодня": ( [SplitContainer([Underscore(), (NonEng(Word.from_("сегодня")))])], [placeholders['word_start'], '_', placeholders['non_eng'], placeholders['word_end']] ), "_Сегодня": ( [SplitContainer([Underscore(), (NonEng(Word.from_("Сегодня")))])], [placeholders['word_start'], '_', placeholders['capital'], placeholders['non_eng'], placeholders['word_end']] ), "Сегодня": ( [SplitContainer([(NonEng(Word.from_("Сегодня")))])], [placeholders['capital'], placeholders['non_eng']] ), '"сегодня"': ( [StringLiteral([SplitContainer([(NonEng(Word.from_("сегодня")))])])], ['"', placeholders['non_eng'], '"'] ), 'logger.info("Установлена licht4bild пользователем" + user.getNick()) ;': ( [LogStatement(SplitContainer.from_single_token('logger'), SplitContainer.from_single_token('info'), INFO, [StringLiteral([ SplitContainer([NonEng(Word.from_('Установлена'))]), SplitContainer([ NonEng(Word.from_('licht')), Word.from_('4'), NonEng(Word.from_('bild')) ]), SplitContainer([NonEng(Word.from_('пользователем'))]) ]), '+', SplitContainer.from_single_token('user'), '.', SplitContainer([ Word.from_('get'),