Пример #1
0
    def test_2(self):
        text = '''
int[] _my_favoRite_ints_ = {0x12, 0x1fE, 441, -81, -0xfFf};
'''
        expected_result = [
            NewLine(),
            SplitContainer.from_single_token('int'), '[', ']',
            SplitContainer([
                Underscore(),
                Word.from_('my'),
                Underscore(),
                NonEng(Word.from_('favo')),
                Word.from_('Rite'),
                Underscore(),
                Word.from_('ints'),
                Underscore()
            ]), '=', '{',
            Number([HexStart(), '1', '2']), ',',
            Number([HexStart(), '1', 'f', 'E']), ',',
            Number(['4', '4', '1']), ',',
            Number(['-', '8', '1']), ',',
            Number(['-', HexStart(), 'f', 'F', 'f']), '}', ';',
            NewLine(),
            NewLine()
        ]

        self.__test_apply_preprocessors(text, expected_result)
Пример #2
0
    def test_7(self):
        text = '''
/*multi-line MyComment_
*//
_operations
'''

        expected_result = [
            NewLine(),
            MultilineComment([
                SplitContainer.from_single_token('multi'), '-',
                SplitContainer.from_single_token('line'),
                SplitContainer(
                    [Word.from_('My'),
                     Word.from_('Comment'),
                     Underscore()]),
                NewLine()
            ]), '/',
            NewLine(),
            SplitContainer([Underscore(),
                            Word.from_('operations')]),
            NewLine(),
            NewLine()
        ]

        self.__test_apply_preprocessors(text, expected_result)
Пример #3
0
    def test_with_numbers_split(self):
        # A parseable token nested in a string literal must be split on
        # underscores, digit runs and case changes; the ":" stays untouched.
        unsplit = ParseableToken("_test_my123GmyClass_")
        literal_in = [StringLiteral([":", unsplit])]

        actual = simple_split(literal_in, {})

        split_identifier = SplitContainer([
            Underscore(),
            Word.from_("test"),
            Underscore(),
            Word.from_("my"),
            Word.from_("123"),
            Word.from_("Gmy"),
            Word.from_("Class"),
            Underscore(),
        ])
        expected = [StringLiteral([":", split_identifier])]

        self.assertEqual(actual, expected)
Пример #4
0
    def test_mark_with_noneng(self):
        # Input: a string literal, a multi-line comment and a one-line
        # comment in which no word has been wrapped in NonEng yet.
        literal_in = StringLiteral([
            SplitContainer([Word.from_("A"), Word.from_("Wirklich")]),
        ])
        multiline_in = MultilineComment([
            SplitContainer.from_single_token('ц'),
            SplitContainer([
                Word.from_("blanco"),
                Underscore(),
                Word.from_("english"),
            ]),
        ])
        one_line_in = OneLineComment([
            SplitContainer([Word.from_("DIESELBE"), Word.from_("8")]),
        ])

        actual = mark([literal_in, multiline_in, one_line_in], {})

        # Expected: "Wirklich", "ц", "blanco" and "DIESELBE" come back
        # wrapped in NonEng; every other subtoken is unchanged.
        literal_out = StringLiteral([
            SplitContainer([Word.from_("A"), NonEng(Word.from_("Wirklich"))]),
        ])
        multiline_out = MultilineComment([
            SplitContainer([NonEng(Word.from_('ц'))]),
            # The constructor is called by hand here because the split
            # container cannot set the wordStart prefix when its first
            # subword is wrapped in NonEng.
            SplitContainer([
                NonEng(Word.from_("blanco")),
                Underscore(),
                Word.from_("english"),
            ]),
        ])
        one_line_out = OneLineComment([
            # Same reason as above: the first subword is wrapped in NonEng,
            # so the container is built by hand.
            SplitContainer([
                NonEng(Word.from_("DIESELBE")),
                Word.from_("8"),
            ]),
        ])
        expected = [literal_out, multiline_out, one_line_out]

        self.assertEqual(expected, actual)
Пример #5
0
    def test_to_repr_with_enonlycontents(self):
        # Preprocessing configuration under test.
        config = PrepConfig({
            PrepParam.EN_ONLY: 2,
            PrepParam.COM_STR: 0,
            PrepParam.SPLIT: 3,
            PrepParam.TABS_NEWLINES: 1,
            PrepParam.MARK_LOGS: 1,
            PrepParam.CAPS: 1
        })
        ngram_split_config = NgramSplitConfig(
            splitting_type=NgramSplittingType.NUMBERS_AND_CUSTOM,
            sc_splittings={})

        # A string literal made up exclusively of NonEng-wrapped words.
        german_words = (
            "ich", "weiss", "nicht", "was", "soll", "es",
            "bedeuten", "dass", "ich", "so", "traurig", "bin",
        )
        non_eng_literal = StringLiteral(
            [NonEng(Word.from_(word)) for word in german_words])

        parsed = [
            Number([1, DecimalPoint(), 1]),
            "*",
            SplitContainer([NonEng(Word.from_("dinero"))]),
            non_eng_literal,
            NewLine(),
            MultilineComment([
                SplitContainer([NonEng(Word.from_('ц'))]),
                SplitContainer([
                    NonEng(Word.from_("blanco")),
                    Underscore(),
                    Word.from_("english"),
                ]),
            ]),
            NewLine(),
            Tab(),
            OneLineComment([
                SplitContainer([
                    NonEng(Word.from_("DIESELBE")),
                    Word.from_("8"),
                ]),
            ]),
        ]

        actual = to_repr(config, parsed, ngram_split_config)

        # Expected flat representation, grouped by the source token that
        # produces each run of output items.
        number_repr = [pl['word_start'], '1', '.', '1', pl['word_end']]
        literal_repr = ['"', pl["non_eng_content"], '"']
        multiline_repr = [
            '/*',
            pl['non_eng'], pl['word_start'], pl['non_eng'], '_', 'english',
            pl['word_end'], '*/',
        ]
        one_line_repr = [
            '//', pl['word_start'], pl['capitals'],
            pl['non_eng'], "8", pl['word_end'], pl['olc_end'],
        ]
        expected = (
            number_repr
            + ["*", pl['non_eng']]
            + literal_repr
            + multiline_repr
            + one_line_repr
        )

        self.assertEqual(expected, actual)
Пример #6
0
    def test_capitals(self):
        text = '''
MyClass Class CONSTANT VAR_WITH_UNDERSCORES
'''

        expected_result = [
            NewLine(),
            SplitContainer([Word.from_("My"),
                            Word.from_("Class")]),
            SplitContainer.from_single_token("Class"),
            SplitContainer.from_single_token("CONSTANT"),
            SplitContainer([
                Word.from_("VAR"),
                Underscore(),
                Word.from_("WITH"),
                Underscore(),
                Word.from_("UNDERSCORES")
            ]),
            NewLine(),
            NewLine()
        ]

        self.__test_apply_preprocessors(text, expected_result)
Пример #7
0
    def test_no_logs(self):
        # Token stream that contains no log statement.
        tokens = [
            NewLine(),
            SplitContainer.from_single_token('long'),
            '[',
            ']',
            SplitContainer([
                Word.from_('lovely'),
                Underscore(),
                Word.from_('longs'),
            ]),
            '=',
            '{',
            Number([HexStart(), '3', '4', 'a', '3', '5', 'E', L()]),
        ]
        # Shallow snapshot taken before the call, so that an in-place change
        # to `tokens` made by logs.mark cannot make the comparison below
        # pass trivially.
        expected = list(tokens)

        actual = logs.mark(tokens, None)

        self.assertEqual(expected, actual)
Пример #8
0
def simple_split_token(token):
    """Split a token into underscore and word subtokens, recursing into containers.

    * A ``ParseableToken`` is converted to ``str`` and scanned for
      underscores, digit runs, lowercase runs with an optional leading
      capital, and uppercase runs not followed by a lowercase letter. The
      matches are returned in a ``SplitContainer``: ``Underscore()`` for
      ``'_'`` and ``Word.from_(...)`` for everything else. Characters the
      pattern does not match are not carried over.
    * A ``ProcessableTokenContainer`` is rebuilt as the same type, with
      every subtoken processed recursively.
    * Any other value is returned unchanged.
    """
    if isinstance(token, ParseableToken):
        subwords = []
        for match in regex.finditer(
                '(_|[0-9]+|[[:upper:]]?[[:lower:]]+|[[:upper:]]+(?![[:lower:]]))',
                str(token)):
            piece = match[0]
            if piece == '_':
                subwords.append(Underscore())
            else:
                subwords.append(Word.from_(piece))
        return SplitContainer(subwords)

    if isinstance(token, ProcessableTokenContainer):
        container_type = type(token)
        return container_type(
            [simple_split_token(child) for child in token.get_subtokens()])

    return token
Пример #9
0
    def test_1(self):
        text = '''
long[] lovely_longs = {0x34a35EL,     0x88bc96fl           , -0x34L};
'''
        expected_result = [
            NewLine(),
            SplitContainer.from_single_token('long'), '[', ']',
            SplitContainer(
                [Word.from_('lovely'),
                 Underscore(),
                 Word.from_('longs')]), '=', '{',
            Number([HexStart(), '3', '4', 'a', '3', '5', 'E',
                    L()]), ',',
            Tab(),
            Number([HexStart(), '8', '8', 'b', 'c', '9', '6', 'f',
                    L()]),
            Tab(),
            Tab(), ',',
            Number(['-', HexStart(), '3', '4', L()]), '}', ';',
            NewLine(),
            NewLine()
        ]

        self.__test_apply_preprocessors(text, expected_result)
Пример #10
0
    def test_process_comments_and_str_literals(self):
        '''
        Positive scenario

        The token stream below corresponds (going by the token names) to:

        <start>"//test_myClass"
        //*/
        "/*!"
        /*
        */
        <end>
        '''

        def identifier():
            # A fresh container per call, so the input and the expected
            # output never share an instance.
            return SplitContainer([
                Word.from_("test"),
                Underscore(),
                Word.from_("my"),
                Word.from_("Class"),
            ])

        # Line 1: a string literal containing a one-line-comment start.
        stream = [Quote(), OneLineCommentStart(), identifier(), Quote(),
                  NewLine()]
        # Line 2: a one-line comment containing a multi-line-comment end.
        stream += [OneLineCommentStart(), MultilineCommentEnd(), NewLine()]
        # Line 3: a string literal containing a multi-line-comment start.
        stream += [Quote(), MultilineCommentStart(),
                   SplitContainer.from_single_token("!"), Quote(), NewLine()]
        # Lines 4-5: a multi-line comment holding only a newline.
        stream += [MultilineCommentStart(), NewLine(), MultilineCommentEnd(),
                   NewLine()]

        actual = process_comments_and_str_literals(stream, {})

        expected = [
            StringLiteral([OneLineCommentStart(), identifier()]),
            NewLine(),
            OneLineComment([MultilineCommentEnd()]),
            NewLine(),
            StringLiteral([
                MultilineCommentStart(),
                SplitContainer.from_single_token("!"),
            ]),
            NewLine(),
            MultilineComment([NewLine()]),
            NewLine(),
        ]

        self.assertEqual(expected, actual)
Пример #11
0
from logrec.dataprep.to_repr import to_repr

# Short alias for the placeholder lookup table.
pl = placeholders

# Module-level token fixture: a decimal number, an operator, a non-English
# identifier, a string literal, a multi-line comment and a one-line comment.
tokens = [
    Number([1, DecimalPoint(), 1]),
    "*",
    SplitContainer([NonEng(Word.from_("dinero"))]),
    StringLiteral([
        SplitContainer([
            Word.from_("A"),
            NonEng(Word.from_("Wirklich")),
        ]),
    ]),
    NewLine(),
    MultilineComment([
        SplitContainer([NonEng(Word.from_('ц'))]),
        SplitContainer([
            NonEng(Word.from_("blanco")),
            Underscore(),
            Word.from_("english"),
        ]),
    ]),
    NewLine(),
    Tab(),
    OneLineComment([
        SplitContainer([
            NonEng(Word.from_("DIESELBE")),
            Word.from_("8"),
        ]),
    ]),
]


class TeprTest(unittest.TestCase):
    def test_both_enonly_and_nosplit(self):
        with self.assertRaises(ValueError):
            prep_config = PrepConfig({
     [placeholders["capital"], "vector"],
 ),
 "players": (
     [SplitContainer.from_single_token("players")],
     [placeholders["word_start"], 'play', 'er', 's', placeholders["word_end"]]
 ),
 "0.345e+4": (
     [Number(["0", DecimalPoint(), "3", "4", "5", E(), "+", "4"])],
     [placeholders["word_start"], "0.", "3", "4", "5", "e+", "4", placeholders["word_end"]]
 ),
 "bestPlayers": (
     [SplitContainer([Word.from_("best"), Word.from_("Players")])],
     [placeholders["word_start"], "best", placeholders["capital"], 'play', "er", "s", placeholders["word_end"]]
 ),
 "test_BestPlayers": (
     [SplitContainer([Word.from_("test"), Underscore(), Word.from_("Best"), Word.from_("Players")])],
     [placeholders["word_start"], "test", '_', placeholders["capital"],
      "best", placeholders["capital"], 'play', "er", "s", placeholders["word_end"]]
 ),
 "test_BestPlayers_modified": (
     [SplitContainer(
         [Word.from_("test"), Underscore(), Word.from_("Best"), Word.from_("Players"), Underscore(),
          Word.from_("modified")]
     )],
     [placeholders["word_start"], "test", '_', placeholders["capital"],
      "best", placeholders["capital"], 'play', "er", "s", '_', "mod",
      "if", "ied",
      placeholders["word_end"]]
 ),
 "N_PLAYERS_NUM": (
     [SplitContainer([Word.from_("N"), Underscore(), Word.from_("PLAYERS"), Underscore(), Word.from_("NUM")])],