示例#1
0
def test_split_string():
    """Check the token list ``split_string`` produces for a mixed input.

    The input combines a number, a newline, a letter/digit identifier,
    a run of five spaces and an identifier containing underscores.
    """
    produced = split_string("123\nAb2cd34Ef000GG     j_89_J")

    # Build the expectation piece by piece, in input order.
    wanted = [Number('123'), NewLine()]
    wanted.append(SplitContainer(
        [Word.from_(piece)
         for piece in ('Ab', '2', 'cd', '34', 'Ef', '000', 'GG')]
    ))
    wanted.append(SpaceInString(5))
    wanted.append(SplitContainer([
        Word.from_('j'), Underscore(),
        Word.from_('89'), Underscore(),
        Word.from_('J'),
    ]))

    assert wanted == produced
示例#2
0
def test_multi_line_comment():
    """Compare ``convert_text`` output for a Java multi-line comment.

    The comment is followed by a stray ``/`` and an identifier that
    starts with an underscore.
    """
    source = '''
/*multi-line MyComment_
*//
_operations
'''

    comment_body = [
        NonCodeChar('/'), NonCodeChar('*'),
        SplitContainer.from_single_token('multi'),
        NonCodeChar('-'),
        SplitContainer.from_single_token('line'),
        SplitContainer([Word.from_('My'), Word.from_('Comment'), Underscore()]),
        NewLine(),
        NonCodeChar('*'), NonCodeChar('/'),
    ]
    wanted = [
        MultilineComment(comment_body),
        Operator('/'),
        NewLine(),
        SplitContainer([Underscore(), Word.from_('operations')]),
        NewLine(),
    ]

    produced = list(convert_text(source, 'java'))

    assert wanted == produced
示例#3
0
def test_floats():
    """Compare ``convert_text`` output for a Java array of float literals."""
    source = '''float[] floats = {-0.43E4f, .58F, 0.d, -9.63e+2D, 0.E-8};'''

    wanted = [
        # Declaration: "float[] floats ="
        KeyWord('float'), Operator('['), Operator(']'),
        SplitContainer.from_single_token('floats'),
        Operator('='),
        # Initializer; a leading minus is a separate operator token.
        OpeningCurlyBracket(),
        Operator('-'), Number("0.43E4f"), Operator(','),
        Number(".58F"), Operator(','),
        Number("0.d"), Operator(','),
        Operator('-'), Number('9.63e+2D'), Operator(','),
        Number('0.E-8'),
        ClosingCurlyBracket(),
        Semicolon(),
        NewLine(),
    ]

    produced = list(convert_text(source, 'java'))

    assert wanted == produced
示例#4
0
def test_longs():
    """Compare ``convert_text`` output for a Java array of hex long literals.

    The literals are separated by runs of spaces, which the expectation
    lists as ``Tab`` tokens.
    """
    source = '''long[] lovely_longs = {0x34a35EL,     0x88bc96fl           , -0x34L};'''

    wanted = [
        # Declaration: "long[] lovely_longs ="
        KeyWord('long'), Operator('['), Operator(']'),
        SplitContainer([Word.from_('lovely'), Underscore(), Word.from_('longs')]),
        Operator('='),
        # Initializer with the whitespace runs between the literals.
        OpeningCurlyBracket(),
        Number("0x34a35EL"), Operator(','),
        Tab(),
        Number("0x88bc96fl"),
        Tab(), Tab(),
        Operator(','),
        Operator('-'), Number("0x34L"),
        ClosingCurlyBracket(),
        Semicolon(),
        NewLine(),
    ]

    produced = list(convert_text(source, 'java'))

    assert wanted == produced
示例#5
0
文件: noneng.py 项目: mir-am/codeprep
 def preprocessed_repr(
         self, repr_config: ReprConfig
 ) -> Tuple[List[str], PreprocessingMetadata]:
     """Build the preprocessed representation of this token.

     When ``repr_config.bpe_data`` is falsy, the ``'non_eng'`` placeholder
     is wrapped via ``wrap_in_metadata_for_full_word``. Otherwise the
     stringified ``processable_token`` is passed through
     ``replace_non_ascii_seqs`` with the ``'non_ascii_seq'`` placeholder
     and the result is handed to ``torepr`` as a single-token
     ``SplitContainer``.
     """
     # Guard clause: no BPE data means the whole token becomes a placeholder.
     if not repr_config.bpe_data:
         non_eng_only = [placeholders['non_eng']]
         return self.wrap_in_metadata_for_full_word(non_eng_only)

     raw_text = str(self.processable_token)
     ascii_text = replace_non_ascii_seqs(raw_text, placeholders['non_ascii_seq'])
     container = SplitContainer.from_single_token(ascii_text)
     return torepr(container, repr_config)
示例#6
0
def test_string_with_spaces():
    """Compare ``convert_text`` output for a string literal with space runs."""
    source = '''"hi   dear     world    !"'''

    literal_parts = [
        NonCodeChar('"'),
        SplitContainer.from_single_token('hi'),
        SpaceInString(3),
        SplitContainer.from_single_token('dear'),
        SpaceInString(5),
        SplitContainer.from_single_token('world'),
        SpaceInString(4),
        NonCodeChar('!'),
        NonCodeChar('"'),
    ]
    wanted = [StringLiteral(literal_parts, 26), NewLine()]

    produced = list(convert_text(source, 'java'))

    assert wanted == produced
示例#7
0
def split_identifier(token: str) -> SplitContainer:
    """Split ``token`` into word and underscore sub-tokens.

    The pattern matches, in order of preference: a single underscore, a
    run of digits, a lowercase run with an optional leading uppercase
    letter, an uppercase run not followed by a lowercase letter, or any
    single non-space character. Each ``'_'`` match becomes an
    ``Underscore``; every other match becomes ``Word.from_(match)``.

    Returns the resulting ``SplitContainer``, wrapped in ``NonEng`` when
    ``is_non_eng(token)`` is truthy.
    """
    pieces = []
    for found in regex.finditer(
            '(_|[0-9]+|[[:upper:]]?[[:lower:]]+|[[:upper:]]+(?![[:lower:]])|[^ ])',
            token):
        fragment = found[0]
        if fragment == '_':
            pieces.append(Underscore())
        else:
            pieces.append(Word.from_(fragment))

    container = SplitContainer(pieces)
    if is_non_eng(token):
        return NonEng(container)
    return container
示例#8
0
def test_spaces_in_strings():
    """Compare ``convert_text`` output for Java code with two string literals.

    The first literal contains a run of four spaces; the second contains
    an escaped double quote.
    """
    source = '''BigAWESOMEString[] a2y = "a    bc".doSplit("\\"");'''

    spaced_literal = StringLiteral([
        NonCodeChar('"'),
        SplitContainer.from_single_token('a'),
        SpaceInString(n_chars=4),
        SplitContainer.from_single_token('bc'),
        NonCodeChar('"'),
    ], 9)
    escaped_quote_literal = StringLiteral(
        [NonCodeChar('"'), NonCodeChar('\\'), NonCodeChar('"'), NonCodeChar('"')],
        4,
    )

    wanted = [
        SplitContainer([Word.from_('Big'), Word.from_('AWESOME'), Word.from_('String')]),
        Operator('['), Operator(']'),
        SplitContainer([Word.from_('a'), Word.from_('2'), Word.from_('y')]),
        Operator('='),
        spaced_literal,
        Operator('.'),
        SplitContainer([Word.from_('do'), Word.from_('Split')]),
        OpeningBracket(),
        escaped_quote_literal,
        ClosingBracket(),
        Semicolon(),
        NewLine(),
    ]

    produced = list(convert_text(source, 'java'))

    assert wanted == produced
示例#9
0
def test_capitals():
    """Compare ``convert_text`` output for identifiers in different casings."""
    source = '''
MyClass Class CONSTANT VAR_WITH_UNDERSCORES
'''

    upper_snake = SplitContainer([
        Word.from_("VAR"), Underscore(),
        Word.from_("WITH"), Underscore(),
        Word.from_("UNDERSCORES"),
    ])
    wanted = [
        SplitContainer([Word.from_("My"), Word.from_("Class")]),
        SplitContainer.from_single_token("Class"),
        SplitContainer.from_single_token("CONSTANT"),
        upper_snake,
        NewLine(),
    ]
    # Note: upper_snake is built before the earlier list entries; the
    # constructors are plain value objects as far as this test shows.

    produced = list(convert_text(source, 'java'))

    assert wanted == produced
示例#10
0
def test_string_literal_double():
    """Compare ``convert_text`` output for a double-quoted Python string.

    The expectation lists the opening quote, the string body and the
    closing quote as three separate ``StringLiteral`` tokens.
    """
    source = '''a = "some_text".split()'''

    wanted = [
        SplitContainer.from_single_token("a"),
        Operator('='),
        StringLiteral([NonCodeChar('"')], 1),
        StringLiteral(
            [SplitContainer([Word.from_("some"), Underscore(), Word.from_("text")])],
            9,
        ),
        StringLiteral([NonCodeChar('"')], 1),
        Operator('.'),
        SplitContainer.from_single_token("split"),
        OpeningBracket(),
        ClosingBracket(),
        NewLine(),
    ]

    produced = list(convert_text(source, 'py'))

    assert wanted == produced
示例#11
0
def test_1():
    """Run ``to_repr`` on one single-token container with a filled merges cache.

    Checks both the returned token list and the returned metadata.
    """
    config = PrepConfig({
        PrepParam.EN_ONLY: 'u',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '4',
        PrepParam.TABS_NEWLINES: 's',
        PrepParam.CASE: 'u',
    })
    input_tokens = [SplitContainer.from_single_token("Whi@le")]
    bpe_data = BpeData(merges_cache={'Whi@@le@': ['Whi@@le@']})

    actual, actual_metadata = to_repr(config, input_tokens, bpe_data)

    expected = ["Whi@le" + placeholders['compound_word_end']]
    expected_metadata = PreprocessingMetadata(
        word_boundaries=[0, 1],
        token_types=[SplitContainer],
    )

    assert expected == actual
    assert expected_metadata == actual_metadata
示例#12
0
def test_merges_no_cache():
    """Run ``to_repr`` with a one-entry merge list and an empty merges cache.

    Checks both the returned token list and the returned metadata.
    """
    config = PrepConfig({
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '4',
        PrepParam.TABS_NEWLINES: 's',
        PrepParam.CASE: 'u',
    })
    input_tokens = [SplitContainer.from_single_token("Whi@l@@e@")]

    # The result of ``append`` is what gets passed on as ``merges``.
    merge_list = MergeList().append(Merge(('W', 'h'), 10))
    bpe_data = BpeData(merges=merge_list, merges_cache={})

    actual, actual_metadata = to_repr(config, input_tokens, bpe_data)

    expected = [
        "Wh", "i", '@', "l", '@', '@', "e", '@',
        pl["compound_word_end"],
    ]
    expected_metadata = PreprocessingMetadata(
        word_boundaries=[0, 9],
        token_types=[SplitContainer],
    )

    assert expected == actual
    assert expected_metadata == actual_metadata
示例#13
0
def test_ints():
    """Compare ``convert_text`` output for a Java array of int literals."""
    source = '''int[] _my_favoRite_ints_ = {0x12, 0x1fE, 441, -81, -0xfFf};'''

    wanted = [
        # Declaration: "int[] _my_favoRite_ints_ ="
        KeyWord('int'), Operator('['), Operator(']'),
        SplitContainer([
            Underscore(), Word.from_('my'),
            Underscore(), Word.from_('favo'), Word.from_('Rite'),
            Underscore(), Word.from_('ints'),
            Underscore(),
        ]),
        Operator('='),
        # Initializer; a leading minus is a separate operator token.
        OpeningCurlyBracket(),
        Number("0x12"), Operator(','),
        Number("0x1fE"), Operator(','),
        Number("441"), Operator(','),
        Operator('-'), Number("81"), Operator(','),
        Operator('-'), Number("0xfFf"),
        ClosingCurlyBracket(),
        Semicolon(),
        NewLine(),
    ]

    produced = list(convert_text(source, 'java'))

    assert wanted == produced
示例#14
0
def test_to_repr_with_enonlycontents1():
    """Run ``to_repr`` on a token stream full of ``NonEng`` tokens.

    The stream holds a number, an operator, a ``NonEng`` identifier, a
    string literal of twelve ``NonEng`` words, a multi-line comment and a
    one-line comment. Both the returned tokens and the returned metadata
    are checked.
    """
    config = PrepConfig({
        PrepParam.EN_ONLY: 'U',
        PrepParam.COM: 'c',
        PrepParam.STR: '1',
        PrepParam.SPLIT: '2',
        PrepParam.TABS_NEWLINES: '0',
        PrepParam.CASE: 'l',
    })

    tokens = [
        Number("1.1"),
        Operator("*"),
        NonEng(SplitContainer([Word.from_("dinero")])),
    ]

    # String literal: quote, words separated by single SpaceInString, quote.
    quoted = [NonCodeChar('"')]
    string_words = ("ich", "weiss", "nicht", "was", "soll", "es",
                    "bedeuten", "dass", "ich", "so", "traurig", "bin")
    for position, word in enumerate(string_words):
        if position > 0:
            quoted.append(SpaceInString())
        quoted.append(NonEng(SplitContainer([Word.from_(word)])))
    quoted.append(NonCodeChar('"'))
    tokens.append(StringLiteral(quoted, 62))

    tokens.append(NewLine())
    tokens.append(MultilineComment([NonCodeChar('/'), NonCodeChar('*')]))
    tokens.append(MultilineComment([
        NonEng(SplitContainer([Word.from_('ц')])),
        NonEng(SplitContainer([
            Word.from_("blanco"), Underscore(), Word.from_("english"),
        ])),
    ]))
    tokens.append(MultilineComment([NonCodeChar('*'), NonCodeChar('/')]))
    tokens.append(NewLine())
    tokens.append(Tab())
    tokens.append(OneLineComment([
        NonCodeChar('/'), NonCodeChar('/'),
        NonEng(SplitContainer([Word.from_("DIESELBE"), Word.from_("8")])),
    ]))

    actual, actual_metadata = to_repr(config, tokens)

    expected = (
        # Number, operator and the non-English identifier.
        [pl['word_start'], '1', '.', '1', pl['word_end'], "*", pl['non_eng']]
        # String literal: twelve placeholders between the quotes.
        + ['"'] + [pl["non_eng"]] * 12 + ['"']
        # Multi-line comment.
        + ['/', '*', pl['non_eng'], pl['non_eng'], '*', '/']
        # One-line comment followed by its end marker.
        + ['/', '/', pl['non_eng'], pl['olc_end']]
    )

    expected_token_types = (
        [Number, Operator, NonEng]
        + [StringLiteral] * 14
        + [MultilineComment] * 6
        + [OneLineComment] * 4
    )
    expected_metadata = PreprocessingMetadata(
        {'*', '"', "/", "*"},
        word_boundaries=[0] + list(range(5, 32)),
        token_types=expected_token_types,
    )

    assert expected == actual
    assert expected_metadata == actual_metadata
示例#15
0
from codeprep.preprocess.metadata import PreprocessingMetadata
from codeprep.tokens.noneng import NonEng
from codeprep.tokens.numeric import Number
from codeprep.preprocess.placeholders import placeholders
from codeprep.tokens.whitespace import Tab, NewLine, SpaceInString
from codeprep.tokens.word import Word, Underscore, NonCodeChar, Operator
from codeprep.prepconfig import PrepParam, PrepConfig
from codeprep.pipeline.to_repr import to_repr

# Short alias for ``placeholders``.
pl = placeholders
# The value stored under the 'compound_word_end' key of ``placeholders``.
cwe = placeholders['compound_word_end']

tokens = [
    Number('1.1'),
    Operator("*"),
    NonEng(SplitContainer([Word.from_("übersetzen")])),
    StringLiteral([
        NonCodeChar('"'),
        NonEng(
            SplitContainer([
                Word.from_("A"),
                Word.from_("Wirklicä")
            ])
        ),
        SpaceInString(1),
        NonCodeChar('"')
    ], 11),
    NewLine(),
    MultilineComment([NonCodeChar('/'), NonCodeChar('*')]),
    MultilineComment([
        NonEng(
示例#16
0
def test_one_line_comment():
    """Compare ``convert_text`` output for a single Java one-line comment.

    The expectation is one ``OneLineComment`` holding the two slashes,
    each word, the apostrophe as a ``NonCodeChar`` and a final ``NewLine``.
    """
    source = '''// this code won't compile but the preprocessing still has to be done corrrectly'''

    comment_parts = [NonCodeChar('/'), NonCodeChar('/')]
    comment_parts.extend(
        SplitContainer.from_single_token(word)
        for word in ('this', 'code', 'won')
    )
    # The apostrophe splits "won't" into two word tokens.
    comment_parts.append(NonCodeChar("'"))
    comment_parts.extend(
        SplitContainer.from_single_token(word)
        for word in ('t', 'compile', 'but', 'the', 'preprocessing', 'still',
                     'has', 'to', 'be', 'done', 'corrrectly')
    )
    comment_parts.append(NewLine())

    wanted = [OneLineComment(comment_parts)]

    produced = list(convert_text(source, 'java'))

    assert wanted == produced
示例#17
0
def test_special_characters():
    """Compare ``convert_text`` output for a block of operators and symbols.

    The expectation below is grouped so that each row corresponds to one
    line of the input text.
    """
    source = '''
abc1
~-0xFFFFFL=
.0E+5
|=
?
==
!=
**
++
--
+=
-=
/=
*=
%=
$
<=
>=
@
    ^=
    &=
    #
                                                                                >>
<<
&&
||
+*!/><\t\n
{}[],.-:();&|\\'~%^
'''

    wanted = [
        SplitContainer([Word.from_('abc'), Word.from_('1')]), NewLine(),
        Operator('~'), Operator('-'), Number("0xFFFFFL"), Operator('='), NewLine(),
        Number(".0E+5"), NewLine(),
        Operator('|'), Operator('='), NewLine(),
        Operator('?'), NewLine(),
        Operator('='), Operator('='), NewLine(),
        Operator('!'), Operator('='), NewLine(),
        Operator('*'), Operator('*'), NewLine(),
        Operator('+'), Operator('+'), NewLine(),
        Operator('-'), Operator('-'), NewLine(),
        Operator('+'), Operator('='), NewLine(),
        Operator('-'), Operator('='), NewLine(),
        Operator('/'), Operator('='), NewLine(),
        Operator('*'), Operator('='), NewLine(),
        Operator('%'), Operator('='), NewLine(),
        NonCodeChar('$'), NewLine(),
        Operator('<'), Operator('='), NewLine(),
        Operator('>'), Operator('='), NewLine(),
        NonCodeChar('@'), NewLine(),
        # Indented lines: the leading whitespace is expected as Tab tokens.
        Tab(), Operator('^'), Operator('='), NewLine(),
        Tab(), Operator('&'), Operator('='), NewLine(),
        Tab(), NonCodeChar('#'), NewLine(),
        # Twenty Tab tokens are expected before the ">>" line.
        *[Tab() for _ in range(20)],
        Operator('>'), Operator('>'), NewLine(),
        Operator('<'), Operator('<'), NewLine(),
        Operator('&'), Operator('&'), NewLine(),
        Operator('|'), Operator('|'), NewLine(),
        # "+*!/><" followed by an escaped tab and an escaped newline.
        Operator('+'), Operator('*'), Operator('!'), Operator('/'),
        Operator('>'), Operator('<'), Tab(), NewLine(), NewLine(),
        # Brackets, punctuation and the remaining single characters.
        OpeningCurlyBracket(), ClosingCurlyBracket(),
        Operator('['), Operator(']'),
        Operator(','), Operator('.'), Operator('-'), Operator(':'),
        OpeningBracket(), ClosingBracket(),
        Semicolon(),
        Operator('&'), Operator('|'),
        NonCodeChar('\\'), NonCodeChar("'"),
        Operator('~'), Operator('%'), Operator('^'),
        NewLine(),
    ]

    produced = list(convert_text(source, 'java'))

    assert wanted == produced