示例#1
0
def test_multi_line_comment():
    """Check parsing of a Java ``/* ... */`` comment spanning two lines.

    The whole comment, including its delimiters and inner newline, is expected
    as one ``MultilineComment``; the ``/`` right after the closing ``*/`` is
    expected as a separate ``Operator`` and the following identifier as a
    ``SplitContainer``.
    """
    text = '''
/*multi-line MyComment_
*//
_operations
'''

    single = SplitContainer.from_single_token
    comment_body = [
        NonCodeChar('/'), NonCodeChar('*'),
        single('multi'), NonCodeChar('-'), single('line'),
        SplitContainer([Word.from_('My'), Word.from_('Comment'), Underscore()]),
        NewLine(),
        NonCodeChar('*'), NonCodeChar('/'),
    ]
    expected_tokens = [
        MultilineComment(comment_body),
        Operator('/'),
        NewLine(),
        SplitContainer([Underscore(), Word.from_('operations')]),
        NewLine(),
    ]

    parsed_tokens = list(convert_text(text, 'java'))

    assert expected_tokens == parsed_tokens
示例#2
0
def test_floats():
    """Check tokenization of Java float literals inside an array initializer.

    Leading minus signs are expected as separate ``Operator('-')`` tokens,
    while exponents and type suffixes stay inside the ``Number`` tokens.
    """
    text = '''float[] floats = {-0.43E4f, .58F, 0.d, -9.63e+2D, 0.E-8};'''

    # Left-hand side of the assignment, up to and including '='.
    declaration = [
        KeyWord('float'), Operator('['), Operator(']'),
        SplitContainer.from_single_token('floats'),
        Operator('='),
    ]
    # The brace-enclosed list of literals.
    initializer = [
        OpeningCurlyBracket(),
        Operator('-'), Number("0.43E4f"), Operator(','),
        Number(".58F"), Operator(','),
        Number("0.d"), Operator(','),
        Operator('-'), Number('9.63e+2D'), Operator(','),
        Number('0.E-8'),
        ClosingCurlyBracket(),
    ]
    expected_tokens = declaration + initializer + [Semicolon(), NewLine()]

    parsed_tokens = list(convert_text(text, 'java'))

    assert expected_tokens == parsed_tokens
示例#3
0
文件: noneng.py 项目: mir-am/codeprep
 def preprocessed_repr(
         self, repr_config: ReprConfig
 ) -> Tuple[List[str], PreprocessingMetadata]:
     """Build the representation of this token for the given config.

     Without ``repr_config.bpe_data`` the token is replaced by the
     ``non_eng`` placeholder, wrapped via ``wrap_in_metadata_for_full_word``.
     With it, the string form of ``self.processable_token`` is run through
     ``replace_non_ascii_seqs`` (using the ``non_ascii_seq`` placeholder),
     wrapped as a single-token ``SplitContainer`` and handed to ``torepr``.
     """
     if not repr_config.bpe_data:
         return self.wrap_in_metadata_for_full_word([placeholders['non_eng']])

     ascii_safe_token = replace_non_ascii_seqs(
         str(self.processable_token), placeholders['non_ascii_seq'])
     container = SplitContainer.from_single_token(ascii_safe_token)
     return torepr(container, repr_config)
示例#4
0
def test_string_with_spaces():
    """Check that runs of spaces inside a Java string become ``SpaceInString``.

    Each run between words is expected as one ``SpaceInString`` carrying the
    run length (3, 5 and 4 here), all inside a single ``StringLiteral``.
    """
    text = '''"hi   dear     world    !"'''

    single = SplitContainer.from_single_token
    literal_parts = [
        NonCodeChar('"'),
        single('hi'), SpaceInString(3),
        single('dear'), SpaceInString(5),
        single('world'), SpaceInString(4),
        NonCodeChar('!'),
        NonCodeChar('"'),
    ]
    expected_tokens = [StringLiteral(literal_parts, 26), NewLine()]

    parsed_tokens = list(convert_text(text, 'java'))

    assert expected_tokens == parsed_tokens
示例#5
0
def test_spaces_in_strings():
    """Check a Java statement with two string literals.

    The first literal contains a run of four spaces; the second contains an
    escaped double quote. Identifiers around them are expected to be split
    into their word parts.
    """
    text = '''BigAWESOMEString[] a2y = "a    bc".doSplit("\\"");'''

    single = SplitContainer.from_single_token
    type_name = SplitContainer(
        [Word.from_('Big'), Word.from_('AWESOME'), Word.from_('String')], )
    variable_name = SplitContainer(
        [Word.from_('a'), Word.from_('2'), Word.from_('y')])
    spaced_literal = StringLiteral([
        NonCodeChar('"'),
        single('a'),
        SpaceInString(n_chars=4),
        single('bc'),
        NonCodeChar('"'),
    ], 9)
    method_name = SplitContainer([Word.from_('do'), Word.from_('Split')])
    escaped_quote_literal = StringLiteral([
        NonCodeChar('"'),
        NonCodeChar('\\'),
        NonCodeChar('"'),
        NonCodeChar('"'),
    ], 4)

    expected_tokens = [
        type_name, Operator('['), Operator(']'),
        variable_name, Operator('='),
        spaced_literal, Operator('.'),
        method_name, OpeningBracket(), escaped_quote_literal, ClosingBracket(),
        Semicolon(),
        NewLine(),
    ]

    parsed_tokens = list(convert_text(text, 'java'))

    assert expected_tokens == parsed_tokens
示例#6
0
def test_capitals():
    """Check splitting of identifiers with different capitalization styles.

    Covers a camel-case name, a capitalized word, an all-caps word and an
    all-caps name with underscores.
    """
    text = '''
MyClass Class CONSTANT VAR_WITH_UNDERSCORES
'''

    camel_case = SplitContainer([Word.from_("My"), Word.from_("Class")])
    capitalized = SplitContainer.from_single_token("Class")
    all_caps = SplitContainer.from_single_token("CONSTANT")
    caps_with_underscores = SplitContainer([
        Word.from_("VAR"), Underscore(),
        Word.from_("WITH"), Underscore(),
        Word.from_("UNDERSCORES"),
    ])
    expected_tokens = [
        camel_case, capitalized, all_caps, caps_with_underscores, NewLine(),
    ]

    parsed_tokens = list(convert_text(text, 'java'))

    assert expected_tokens == parsed_tokens
示例#7
0
def test_string_literal_double():
    """Check parsing of a double-quoted string with the ``'py'`` extension.

    The opening quote, the string content and the closing quote are each
    expected as their own ``StringLiteral`` token.
    """
    text = '''a = "some_text".split()'''

    opening_quote = StringLiteral([NonCodeChar('"')], 1)
    content = StringLiteral([
        SplitContainer(
            [Word.from_("some"), Underscore(), Word.from_("text")]),
    ], 9)
    closing_quote = StringLiteral([NonCodeChar('"')], 1)

    expected_tokens = [
        SplitContainer.from_single_token("a"),
        Operator('='),
        opening_quote, content, closing_quote,
        Operator('.'),
        SplitContainer.from_single_token("split"),
        OpeningBracket(), ClosingBracket(),
        NewLine(),
    ]

    parsed_tokens = list(convert_text(text, 'py'))

    assert expected_tokens == parsed_tokens
示例#8
0
def test_1():
    """Check ``to_repr`` for one token whose entry is in the merges cache.

    The single ``SplitContainer`` token is expected to come back as one string
    ending with the ``compound_word_end`` placeholder, with word boundaries
    ``[0, 1]``.
    """
    config_values = {
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '4',
        PrepParam.TABS_NEWLINES: 's',
        PrepParam.CASE: 'u',
    }
    prep_config = PrepConfig(config_values)

    tokens = [SplitContainer.from_single_token("Whi@le")]
    bpe_data = BpeData(merges_cache={'Whi@@le@': ['Whi@@le@']})

    actual, actual_metadata = to_repr(prep_config, tokens, bpe_data)

    expected = ["Whi@le" + placeholders['compound_word_end']]
    expected_metadata = PreprocessingMetadata(
        word_boundaries=[0, 1], token_types=[SplitContainer])

    assert expected == actual
    assert expected_metadata == actual_metadata
示例#9
0
def test_merges_no_cache():
    """Check ``to_repr`` when the merges cache is empty.

    Only a ``('W', 'h')`` merge is supplied, so the expected output is ``"Wh"``
    followed by the remaining characters one by one and the
    ``compound_word_end`` placeholder, with word boundaries ``[0, 9]``.
    """
    config_values = {
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '4',
        PrepParam.TABS_NEWLINES: 's',
        PrepParam.CASE: 'u',
    }
    prep_config = PrepConfig(config_values)

    tokens = [SplitContainer.from_single_token("Whi@l@@e@")]
    # The value returned by append() is what gets passed as `merges`.
    merges = MergeList().append(Merge(('W', 'h'), 10))
    bpe_data = BpeData(merges=merges, merges_cache={})

    actual, actual_metadata = to_repr(prep_config, tokens, bpe_data)

    expected = [
        "Wh", "i", '@', "l", '@', '@', "e", '@', pl["compound_word_end"],
    ]
    expected_metadata = PreprocessingMetadata(
        word_boundaries=[0, 9], token_types=[SplitContainer])

    assert expected == actual
    assert expected_metadata == actual_metadata
示例#10
0
def test_one_line_comment():
    """Check parsing of a Java ``//`` comment into a single ``OneLineComment``.

    The apostrophe in ``won't`` is expected as a ``NonCodeChar`` between two
    word tokens, and the trailing ``NewLine`` is expected inside the comment.
    """
    text = '''// this code won't compile but the preprocessing still has to be done corrrectly'''

    single = SplitContainer.from_single_token
    # Words following the apostrophe-split "won't", in source order.
    remaining_words = (
        'compile', 'but', 'the', 'preprocessing', 'still',
        'has', 'to', 'be', 'done', 'corrrectly',
    )
    comment_tokens = [
        NonCodeChar('/'), NonCodeChar('/'),
        single('this'), single('code'),
        single('won'), NonCodeChar("'"), single('t'),
    ]
    comment_tokens.extend(single(word) for word in remaining_words)
    comment_tokens.append(NewLine())

    expected_tokens = [OneLineComment(comment_tokens)]

    parsed_tokens = list(convert_text(text, 'java'))

    assert expected_tokens == parsed_tokens