def test_7(self):
    '''
    Multi-line comment followed by a stray slash and an identifier.

    The expected tokens wrap the comment body (including the identifier
    with a trailing underscore) in a MultilineComment and keep the extra
    '/' and '_operations' outside of it.
    '''
    single = SplitContainer.from_single_token
    word = Word.from_
    # NOTE(review): the expected tokens contain NewLine() entries, so this
    # literal presumably spanned several lines originally - confirm.
    source = ''' /*multi-line MyComment_ *// _operations '''
    tokens = [
        NewLine(),
        MultilineComment([
            single('multi'),
            '-',
            single('line'),
            SplitContainer([word('My'), word('Comment'), Underscore()]),
            NewLine(),
        ]),
        '/',
        NewLine(),
        SplitContainer([Underscore(), word('operations')]),
        NewLine(),
        NewLine(),
    ]
    self.__test_apply_preprocessors(source, tokens)
def test_2(self):
    '''
    Int array declaration initialised with decimal and hex literals.

    Covers an identifier with leading/trailing underscores and one NonEng
    part, plus positive and negative Number tokens in both notations.
    '''
    word = Word.from_
    # NOTE(review): surrounding newlines of this literal look collapsed in
    # the reviewed snapshot (the expectation starts/ends with NewLine()).
    source = ''' int[] _my_favoRite_ints_ = {0x12, 0x1fE, 441, -81, -0xfFf}; '''
    tokens = [
        NewLine(),
        SplitContainer.from_single_token('int'),
        '[',
        ']',
        SplitContainer([
            Underscore(),
            word('my'),
            Underscore(),
            NonEng(word('favo')),
            word('Rite'),
            Underscore(),
            word('ints'),
            Underscore(),
        ]),
        '=',
        '{',
        Number([HexStart()] + list('12')),
        ',',
        Number([HexStart()] + list('1fE')),
        ',',
        Number(list('441')),
        ',',
        Number(list('-81')),
        ',',
        Number(['-', HexStart()] + list('fFf')),
        '}',
        ';',
        NewLine(),
        NewLine(),
    ]
    self.__test_apply_preprocessors(source, tokens)
def test_5(self):
    '''
    A one-line comment made of plain words and a single apostrophe.

    Every word is expected as its own single-token SplitContainer inside a
    OneLineComment; the apostrophe of "won't" stays a bare "'" string.
    '''
    single = SplitContainer.from_single_token
    # NOTE(review): surrounding newlines of this literal look collapsed in
    # the reviewed snapshot (the expectation starts/ends with NewLine()).
    source = ''' // this code won't compile but the preprocessing still has to be done corrrectly '''
    before_apostrophe = ['this', 'code', 'won']
    after_apostrophe = [
        't', 'compile', 'but', 'the', 'preprocessing', 'still',
        'has', 'to', 'be', 'done', 'corrrectly',
    ]
    comment_body = [single(w) for w in before_apostrophe]
    comment_body.append("'")
    comment_body.extend(single(w) for w in after_apostrophe)
    tokens = [NewLine(), OneLineComment(comment_body), NewLine(), NewLine()]
    self.__test_apply_preprocessors(source, tokens)
def test_to_repr_with_enonlycontents(self):
    '''
    to_repr with EN_ONLY=2, COM_STR=0, SPLIT=3, TABS_NEWLINES=1,
    MARK_LOGS=1 and CAPS=1.

    The string literal consisting only of NonEng words is expected to be
    rendered as a single 'non_eng_content' placeholder between quotes, and
    no newline/tab entries are expected in the output.
    '''
    word = Word.from_
    config = PrepConfig({
        PrepParam.EN_ONLY: 2,
        PrepParam.COM_STR: 0,
        PrepParam.SPLIT: 3,
        PrepParam.TABS_NEWLINES: 1,
        PrepParam.MARK_LOGS: 1,
        PrepParam.CAPS: 1,
    })
    split_config = NgramSplitConfig(
        splitting_type=NgramSplittingType.NUMBERS_AND_CUSTOM,
        sc_splittings={})

    german = "ich weiss nicht was soll es bedeuten dass ich so traurig bin"
    tokens = [
        Number([1, DecimalPoint(), 1]),
        "*",
        SplitContainer([NonEng(word("dinero"))]),
        StringLiteral([NonEng(word(w)) for w in german.split(" ")]),
        NewLine(),
        MultilineComment([
            SplitContainer([NonEng(word('ц'))]),
            SplitContainer([NonEng(word("blanco")), Underscore(), word("english")]),
        ]),
        NewLine(),
        Tab(),
        OneLineComment([
            SplitContainer([NonEng(word("DIESELBE")), word("8")]),
        ]),
    ]

    actual = to_repr(config, tokens, split_config)

    number_repr = [pl['word_start'], '1', '.', '1', pl['word_end']]
    string_repr = ['"', pl["non_eng_content"], '"']
    multiline_repr = [
        '/*', pl['non_eng'], pl['word_start'], pl['non_eng'], '_',
        'english', pl['word_end'], '*/',
    ]
    one_line_repr = [
        '//', pl['word_start'], pl['capitals'], pl['non_eng'], "8",
        pl['word_end'], pl['olc_end'],
    ]
    expected = (number_repr + ["*", pl['non_eng']] + string_repr
                + multiline_repr + one_line_repr)

    self.assertEqual(expected, actual)
def test_mark_all_eng(self):
    '''
    Input where every word is English: the result of mark() is expected
    to compare equal to the tokens that were passed in.
    '''
    single = SplitContainer.from_single_token
    word = Word.from_
    tokens = [
        StringLiteral([
            OneLineCommentStart(),
            SplitContainer([word("test"), word("my"), word("class")]),
        ]),
        NewLine(),
        OneLineComment([MultilineCommentEnd(), single("lifeisgood")]),
        NewLine(),
        StringLiteral([MultilineCommentStart(), single("!")]),
        NewLine(),
        MultilineComment([NewLine()]),
        NewLine(),
    ]

    marked = mark(tokens, {})

    self.assertEqual(marked, tokens)
def test_process_comments_and_str_literals_newline_after_open_quote(self):
    '''
    A quote immediately followed by a newline: the expected output is the
    same two tokens, i.e. no StringLiteral is formed.
    '''
    result = process_comments_and_str_literals([Quote(), NewLine()], {})

    self.assertEqual([Quote(), NewLine()], result)
def test_simple_log(self):
    '''
    `log.info("Hi");` followed by a hex number: the call (through the
    semicolon) is expected to become one LogStatement with level INFO,
    while the trailing Number stays untouched.
    '''
    single = SplitContainer.from_single_token
    hex_digits = list('34a35E')
    tokens = [
        NewLine(),
        single('log'), '.', single('info'),
        '(', StringLiteral([single("Hi")]), ')', ';',
        Number([HexStart()] + hex_digits + [L()]),
    ]

    result = logs.mark(tokens, None)

    expected = [
        NewLine(),
        LogStatement(single('log'), single('info'), INFO,
                     [StringLiteral([single("Hi")])]),
        Number([HexStart()] + hex_digits + [L()]),
    ]
    self.assertEqual(expected, result)
def test_2_logs(self):
    '''
    Two consecutive log calls, `log.t("Hi");` and `Logger.SEVERE("Hi");`,
    are expected to be marked as LogStatements with levels TRACE and
    FATAL respectively.
    '''
    single = SplitContainer.from_single_token
    tokens = [
        NewLine(),
        single('log'), '.', single('t'),
        '(', StringLiteral([single("Hi")]), ')', ';',
        NewLine(),
        single('Logger'), '.', single('SEVERE'),
        '(', StringLiteral([single("Hi")]), ')', ';',
    ]

    result = logs.mark(tokens, None)

    expected = [
        NewLine(),
        LogStatement(single('log'), single('t'), TRACE,
                     [StringLiteral([single("Hi")])]),
        NewLine(),
        LogStatement(single('Logger'), single('SEVERE'), FATAL,
                     [StringLiteral([single("Hi")])]),
    ]
    self.assertEqual(expected, result)
def test_no_dot(self):
    '''
    `log infooooo("Hi");` has no '.' between the two identifiers; the
    tokens are expected to come back from logs.mark() unchanged.
    '''
    single = SplitContainer.from_single_token
    tokens = [
        NewLine(),
        single('log'), single('infooooo'),
        '(', StringLiteral([single("Hi")]), ')', ';',
        Number([HexStart()] + list('34a35E') + [L()]),
    ]

    result = logs.mark(tokens, None)

    self.assertEqual(tokens, result)
def test_no_logs(self):
    '''
    A `long[] lovely_longs = {0x34a35EL` fragment without any log call;
    the tokens are expected to come back from logs.mark() unchanged.
    '''
    tokens = [
        NewLine(),
        SplitContainer.from_single_token('long'),
        '[',
        ']',
        SplitContainer([
            Word.from_('lovely'), Underscore(), Word.from_('longs'),
        ]),
        '=',
        '{',
        Number([HexStart()] + list('34a35E') + [L()]),
    ]

    result = logs.mark(tokens, None)

    self.assertEqual(tokens, result)
def test_capitals(self):
    '''
    Identifiers with different capitalisation patterns: camel case is
    expected to be split into words, while a single capitalised or
    all-caps word stays one token and underscores separate the parts of
    an upper-case constant.
    '''
    single = SplitContainer.from_single_token
    word = Word.from_
    # NOTE(review): surrounding newlines of this literal look collapsed in
    # the reviewed snapshot (the expectation starts/ends with NewLine()).
    source = ''' MyClass Class CONSTANT VAR_WITH_UNDERSCORES '''
    tokens = [
        NewLine(),
        SplitContainer([word("My"), word("Class")]),
        single("Class"),
        single("CONSTANT"),
        SplitContainer([
            word("VAR"), Underscore(),
            word("WITH"), Underscore(),
            word("UNDERSCORES"),
        ]),
        NewLine(),
        NewLine(),
    ]
    self.__test_apply_preprocessors(source, tokens)
def test_4(self):
    '''
    Statement with mixed-case identifiers and two string literals, the
    second of which holds an escaped quote (expected as Backslash, Quote
    inside a StringLiteral).
    '''
    word = Word.from_
    # NOTE(review): surrounding newlines of this literal look collapsed in
    # the reviewed snapshot (the expectation starts/ends with NewLine()).
    source = ''' BigAWESOMEString[] a2y = "abc".doSplit("\\""); '''
    tokens = [
        NewLine(),
        SplitContainer([word('Big'), word('AWESOME'), word('String')]),
        '[',
        ']',
        SplitContainer([word('a'), word('2'), word('y')]),
        '=',
        StringLiteral([SplitContainer.from_single_token('abc')]),
        '.',
        SplitContainer([word('do'), word('Split')]),
        '(',
        StringLiteral([Backslash(), Quote()]),
        ')',
        ';',
        NewLine(),
        NewLine(),
    ]
    self.__test_apply_preprocessors(source, tokens)
def test_3(self):
    '''
    Float array declaration covering several literal shapes: sign,
    leading or trailing decimal point, exponent with and without sign,
    and the f/F/d/D suffixes.
    '''
    single = SplitContainer.from_single_token
    # NOTE(review): surrounding newlines of this literal look collapsed in
    # the reviewed snapshot (the expectation starts/ends with NewLine()).
    source = ''' float[] floats = {-0.43E4f, .58F, 0.d, -9.63e+2D, 0.E-8}; '''
    tokens = [
        NewLine(),
        single('float'),
        '[',
        ']',
        single('floats'),
        '=',
        '{',
        # -0.43E4f
        Number(['-', '0', DecimalPoint(), '4', '3', E(), '4', F()]),
        ',',
        # .58F
        Number([DecimalPoint(), '5', '8', F()]),
        ',',
        # 0.d
        Number(['0', DecimalPoint(), D()]),
        ',',
        # -9.63e+2D
        Number(['-', '9', DecimalPoint(), '6', '3', E(), '+', '2', D()]),
        ',',
        # 0.E-8
        Number(['0', DecimalPoint(), E(), '-', '8']),
        '}',
        ';',
        NewLine(),
        NewLine(),
    ]
    self.__test_apply_preprocessors(source, tokens)
def test_1(self):
    '''
    Long array declaration with hex literals carrying an l/L suffix
    (both expected as L()), including a negative one, with Tab tokens
    expected around the second number.
    '''
    # NOTE(review): the expectation contains Tab()/NewLine() tokens, so the
    # whitespace of this literal looks collapsed in the reviewed snapshot.
    source = ''' long[] lovely_longs = {0x34a35EL, 0x88bc96fl , -0x34L}; '''
    tokens = [
        NewLine(),
        SplitContainer.from_single_token('long'),
        '[',
        ']',
        SplitContainer([
            Word.from_('lovely'), Underscore(), Word.from_('longs'),
        ]),
        '=',
        '{',
        Number([HexStart()] + list('34a35E') + [L()]),
        ',',
        Tab(),
        Number([HexStart()] + list('88bc96f') + [L()]),
        Tab(),
        Tab(),
        ',',
        Number(['-', HexStart()] + list('34') + [L()]),
        '}',
        ';',
        NewLine(),
        NewLine(),
    ]
    self.__test_apply_preprocessors(source, tokens)
def test_content_length_over_limit(self):
    '''
    `log.info(<deeply nested expression>);` followed by a hex number.

    The tokens are expected to come back from logs.mark() unchanged
    (judging by the test name, because the call's content exceeds a
    length limit - the limit itself is not visible here).

    The previous version of this test was missing three commas in the
    token list, so adjacent string literals were silently concatenated
    into '))' / ');' tokens. The input therefore had unbalanced
    parentheses and no ';' terminator. The sequences are now generated,
    which keeps the parentheses balanced and the ';' a separate token.
    '''
    single = SplitContainer.from_single_token
    # NOTE(review): 32 matches the number of '(' / ')' literals counted in
    # the previous hand-written list - confirm.
    nesting_depth = 32
    tokens = (
        [NewLine(), single('log'), '.', single('info')]
        + ['('] * nesting_depth
        + ['1', '*', '3']
        + [')'] * nesting_depth
        + [';', Number([HexStart(), '3', '4', 'a', '3', '5', 'E', L()])]
    )

    actual = logs.mark(tokens, None)

    self.assertEqual(tokens, actual)
def test_tabs_and_newlines_before_semicolon(self):
    '''
    `log.d("Hi")` with newlines and tabs between ')' and ';'.

    The call is expected to become one LogStatement with level DEBUG
    whose fifth argument carries those whitespace tokens; the trailing
    Number stays untouched.
    '''
    single = SplitContainer.from_single_token
    hex_digits = list('34a35E')
    tokens = [
        NewLine(),
        single('log'), '.', single('d'),
        '(', StringLiteral([single("Hi")]), ')',
        NewLine(), NewLine(), Tab(), Tab(),
        ';',
        Number([HexStart()] + hex_digits + [L()]),
    ]

    result = logs.mark(tokens, None)

    expected = [
        NewLine(),
        LogStatement(
            single('log'),
            single('d'),
            DEBUG,
            [StringLiteral([single("Hi")])],
            [NewLine(), NewLine(), Tab(), Tab()],
        ),
        Number([HexStart()] + hex_digits + [L()]),
    ]
    self.assertEqual(expected, result)
def test_nested_data_class(self):
    '''
    Class `A` with two methods and a nested static class `B` that has a
    constructor and a static initializer.

    loggable.mark() is expected to wrap the bodies of both methods, the
    constructor and the static initializer in LoggableBlock, and to leave
    the class bodies themselves and everything before `class A` as is.
    '''
    single = SplitContainer.from_single_token

    def idents(*names):
        # One single-token SplitContainer per name, in order.
        return [single(name) for name in names]

    def preamble():
        # `{ } /*class*/ import a <newline> class A {`
        return (['{', '}', MultilineComment([single("class")])]
                + idents('import', "a") + [NewLine()]
                + idents('class', 'A') + ['{'])

    def if_true_body():
        # `{ if (True) { } }`
        return ['{', single('if'), '(', single('True'), ')', '{', '}', '}']

    def static_init_body():
        # `{ c = "class".class }`
        return ['{', single('c'), '=', StringLiteral([single('class')]),
                '.', single('class'), '}']

    def nested_class_header():
        # `static private class B extends D { private String b ;`
        return (idents('static', 'private', 'class', 'B', 'extends', 'D')
                + ['{'] + idents('private', 'String', 'b') + [';'])

    tokens = (
        preamble()
        + idents('void', 'print1') + ['(', ')'] + if_true_body()
        + nested_class_header()
        + idents('B') + ['(', ')', '{', '}']
        + idents('static') + static_init_body()
        + ['}']
        + idents('void', 'print') + ['(', ')'] + if_true_body()
        + idents('int', 'a') + [';', '}']
    )

    result = loggable.mark(tokens, None)

    expected = (
        preamble()
        + idents('void', 'print1') + ['(', ')', LoggableBlock(if_true_body())]
        + nested_class_header()
        + idents('B') + ['(', ')', LoggableBlock(['{', '}'])]
        + idents('static') + [LoggableBlock(static_init_body())]
        + ['}']
        + idents('void', 'print') + ['(', ')', LoggableBlock(if_true_body())]
        + idents('int', 'a') + [';', '}']
    )
    self.assertEqual(expected, result)
def test_6(self):
    '''
    Mixed identifiers, numbers, operators and punctuation.

    Two-character operators are expected as single string tokens,
    escaped tab/newline characters as Tab()/NewLine(), and a backslash
    as Backslash().
    '''
    # NOTE(review): the expectation contains many NewLine()/Tab() tokens,
    # so the whitespace of this literal looks collapsed in the reviewed
    # snapshot - confirm against the original file.
    text = ''' 9a abc1 ~-0xFFFFFL= .0E+5 |= ? == != ** ++ -- += -= /= *= %= $ <= >= @ ^= &= # >> << && || +*!/><\t\n {}[],.-:();&|\\'~%^ '''
    expected_result = [
        NewLine(),
        # Digits and letters inside one identifier are separate words.
        SplitContainer([Word.from_('9'), Word.from_('a')]),
        SplitContainer([Word.from_('abc'), Word.from_('1')]),
        NewLine(),
        # ~-0xFFFFFL=
        '~', Number(['-', HexStart(), 'F', 'F', 'F', 'F', 'F', L()]), '=',
        NewLine(),
        # .0E+5
        Number([DecimalPoint(), '0', E(), '+', '5']),
        NewLine(),
        '|=', NewLine(),
        '?', NewLine(),
        '==', NewLine(),
        '!=', NewLine(),
        '**', NewLine(),
        '++', NewLine(),
        '--', NewLine(),
        '+=', NewLine(),
        '-=', NewLine(),
        '/=', NewLine(),
        '*=', NewLine(),
        '%=', NewLine(),
        '$', NewLine(),
        '<=', NewLine(),
        '>=', NewLine(),
        '@', NewLine(),
        Tab(), '^=', NewLine(),
        Tab(), '&=', NewLine(),
        Tab(), '#', NewLine(),
        # Run of twenty consecutive tabs before '>>'.
        Tab(), Tab(), Tab(), Tab(), Tab(),
        Tab(), Tab(), Tab(), Tab(), Tab(),
        Tab(), Tab(), Tab(), Tab(), Tab(),
        Tab(), Tab(), Tab(), Tab(), Tab(),
        '>>', NewLine(),
        '<<', NewLine(),
        '&&', NewLine(),
        '||', NewLine(),
        # One-character operators, then the escaped \t and \n of the literal.
        '+', '*', '!', '/', '>', '<', Tab(), NewLine(), NewLine(),
        '{', '}', '[', ']', ',', '.', '-', ':', '(', ')', ';', '&', '|',
        Backslash(), "'", '~', '%', '^',
        NewLine(), NewLine()
    ]
    self.__test_apply_preprocessors(text, expected_result)
from logrec.dataprep.model.chars import MultilineCommentStart, MultilineCommentEnd, OneLineCommentStart, \
    NewLine, Backslash, Quote
from logrec.dataprep.model.containers import MultilineComment, OneLineComment, StringLiteral, \
    ProcessableTokenContainer
from logrec.dataprep.model.numeric import Number, D, F, L, DecimalPoint, HexStart, E
from logrec.dataprep.model.placeholders import placeholders
from logrec.dataprep.model.word import ParseableToken

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Shared instances of the character-token classes imported above.
START_MULTILINE_COMMENT = MultilineCommentStart()
END_MULTILINE_COMMENT = MultilineCommentEnd()
START_ONE_LINE_COMMENT = OneLineCommentStart()
NEW_LINE = NewLine()
QUOTE = Quote()
BACKSLASH = Backslash()

# "\t0" ... "\t10": a tab character followed by each index of range(11).
tabs = ["\t" + str(i) for i in range(11)]

# Opening and closing delimiters of a multi-line comment.
multiline_comments_tokens = ["/*", "*/"]

# Two-character operator strings (including the "//" comment start).
two_character_tokens = [
    "==", "!=", "**", "//", "++", "--", "+=", "-=", "/=", "*=", "%=", "<=",
    ">=", "^=", "&=", "|=", ">>", "<<", "&&", "||"
]

# One-character operator strings.
one_character_tokens = ["+", "*", "!", "/", ">", "<", "="]
from logrec.dataprep.model.numeric import DecimalPoint, Number
from logrec.dataprep.model.placeholders import placeholders
from logrec.dataprep.model.word import Word, Underscore
from logrec.dataprep.prepconfig import PrepParam, PrepConfig
from logrec.dataprep.split.ngram import NgramSplittingType, NgramSplitConfig
from logrec.dataprep.to_repr import to_repr

# Short alias for the placeholders mapping.
pl = placeholders

# Module-level token fixture: a number, an operator, a NonEng identifier,
# a string literal mixing an English and a NonEng word, a multi-line
# comment and a one-line comment, separated by NewLine()/Tab() tokens.
tokens = [
    Number([1, DecimalPoint(), 1]),
    "*",
    SplitContainer([NonEng(Word.from_("dinero"))]),
    StringLiteral(
        [SplitContainer([Word.from_("A"), NonEng(Word.from_("Wirklich"))])]),
    NewLine(),
    MultilineComment([
        SplitContainer([NonEng(Word.from_('ц'))]),
        SplitContainer([
            NonEng(Word.from_("blanco")),
            Underscore(),
            Word.from_("english")
        ])
    ]),
    NewLine(),
    Tab(),
    OneLineComment(
        [SplitContainer([NonEng(Word.from_("DIESELBE")), Word.from_("8")])])
]
def test_process_comments_and_str_literals(self):
    '''
    Positive scenario with four lines of tokens:

    1. a quoted string that contains a one-line-comment start,
    2. a one-line comment that contains a multi-line-comment end,
    3. a quoted string that contains a multi-line-comment start,
    4. a multi-line comment spanning a newline.

    Quoted runs are expected to become StringLiteral, the comment runs
    OneLineComment / MultilineComment, with the delimiting quotes and
    the comment start/end markers of real comments dropped.
    '''
    single = SplitContainer.from_single_token
    word = Word.from_
    tokens = [
        # line 1
        Quote(),
        OneLineCommentStart(),
        SplitContainer([word("test"), Underscore(), word("my"), word("Class")]),
        Quote(),
        NewLine(),
        # line 2
        OneLineCommentStart(),
        MultilineCommentEnd(),
        NewLine(),
        # line 3
        Quote(),
        MultilineCommentStart(),
        single("!"),
        Quote(),
        NewLine(),
        # line 4
        MultilineCommentStart(),
        NewLine(),
        MultilineCommentEnd(),
        NewLine(),
    ]

    result = process_comments_and_str_literals(tokens, {})

    expected = [
        StringLiteral([
            OneLineCommentStart(),
            SplitContainer([word("test"), Underscore(), word("my"), word("Class")]),
        ]),
        NewLine(),
        OneLineComment([MultilineCommentEnd()]),
        NewLine(),
        StringLiteral([MultilineCommentStart(), single("!")]),
        NewLine(),
        MultilineComment([NewLine()]),
        NewLine(),
    ]
    self.assertEqual(expected, result)
def from_file(lines):
    """Turn an iterable of text lines into a flat token list.

    For every line a ParseableToken followed by a NewLine() is emitted.
    The token wraps the line unchanged when it is non-empty and does not
    end with a newline character; otherwise the last character is cut off
    (which leaves an empty line empty).
    """
    result = []
    for raw in lines:
        if not len(raw) > 0 or raw[-1] == '\n':
            content = raw[:-1]
        else:
            content = raw
        result.append(ParseableToken(content))
        result.append(NewLine())
    return result