def apply_obfuscation(source):
    """
    Returns 'source' all obfuscated.
    """
    global keyword_args
    global imported_modules

    tokens = token_utils.listified_tokenizer(source)
    keyword_args = analyze.enumerate_keyword_args(tokens)
    imported_modules = analyze.enumerate_imports(tokens)

    variables = find_obfuscatables(tokens, obfuscatable_variable)
    classes = find_obfuscatables(tokens, obfuscatable_class)
    functions = find_obfuscatables(tokens, obfuscatable_function)

    variables = list(set(variables).difference(set(imported_modules)))

    for variable in variables:
        replace_obfuscatables(
            imported_modules, tokens, obfuscate_variable, variable, name_generator)
    # for function in functions:
    #     replace_obfuscatables(imported_modules,
    #         tokens, obfuscate_function, function, name_generator)
    # for _class in classes:
    #     replace_obfuscatables(imported_modules, tokens, obfuscate_class, _class, name_generator)
    return token_utils.untokenize(tokens)
def minify(tokens, options):
    """
    Performs minification on *tokens* according to the values in *options*
    """
    # Remove comments
    #remove_comments(tokens)
    # Remove docstrings
    #remove_docstrings(tokens)
    result = token_utils.untokenize(tokens)
    # Minify our input script
    result = multiline_indicator.sub('', result)
    #result = fix_empty_methods(result)
    result = join_multiline_pairs(result)
    result = join_multiline_pairs(result, '[]')
    print(result)
    print("\n---------------------------------------------------------------------\n")
    #result = join_multiline_pairs(result, '{}')
    # collapse the code onto one line
    result = remove_blank_lines(result)
    result = reduce_operators(result)
    #print(result)
    #result = dedent(result, use_tabs=options.tabs)
    return result
def transform_source(source, **kwargs):
    """A simple replacement of ``λ`` by ``lambda``."""
    tokens = token_utils.tokenize(source)
    for token in tokens:
        if token == "λ":
            token.string = "lambda"
    return token_utils.untokenize(tokens)
def transform_source(source, **kwargs):
    """Replace integers by Fraction objects"""
    tokens = token_utils.tokenize(source)
    for token in tokens:
        if token.is_integer():
            token.string = f"Fraction({token.string})"
    return token_utils.untokenize(tokens)
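# Hypothetical usage sketch, not part of the original example: it assumes the
# transform_source above is in scope and that the rewritten code will run in a
# namespace where `from fractions import Fraction` has been executed.
def demo_fraction_transform():
    print(transform_source("ratio = 1 / 3"))
    # expected output: ratio = Fraction(1) / Fraction(3)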
def transform_source(source, **kwargs):
    """Simple transformation: replaces any single token λ by lambda.

    By defining this function, we can also make use of Ideas' console.
    """
    tokens = token_utils.tokenize(source)
    for token in tokens:
        if token == "λ":
            token.string = "lambda"
    return token_utils.untokenize(tokens)
def transform_source(source, **kwargs):
    """Simple transformation: replaces any explicit float by a Decimal.

    By defining this function, we can also make use of Ideas' console.
    """
    tokens = token_utils.tokenize(source)
    for token in tokens:
        if token.is_number() and "." in token.string:
            token.string = f"Decimal('{token.string}')"
    return token_utils.untokenize(tokens)
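# Hypothetical usage sketch, not part of the original example: the rewritten
# source still needs `from decimal import Decimal` in the namespace where it
# is eventually executed.
def demo_decimal_transform():
    print(transform_source("total = 0.1 + 0.2"))
    # expected output: total = Decimal('0.1') + Decimal('0.2')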
def automatic_self(source):
    """Replaces code like::

        self .= :
            a
            b
            c = this if __ == that else ___

    by::

        self.a = a
        self.b = b
        self.c = this if c == that else c
    """
    new_tokens = []
    auto_self_block = False
    self_name = ""
    indentation = 0

    get_nb = token_utils.get_number
    get_first = token_utils.get_first
    get_first_index = token_utils.get_first_index

    for tokens in token_utils.get_lines(source):
        if auto_self_block:
            variable = get_first(tokens)
            if variable is not None:  # None would mean an empty line
                var_name = variable.string
                block_indent = variable.start_col
                if block_indent > indentation:
                    dedent = block_indent - indentation
                    if get_nb(tokens) == 1:
                        variable.string = f"{self_name}.{var_name} = {var_name}"
                        tokens = token_utils.dedent(tokens, dedent)
                    else:
                        variable.string = f"{self_name}.{var_name}"
                        for token in tokens:
                            if token.string == "__":
                                token.string = var_name
                        tokens = token_utils.dedent(tokens, dedent)
                else:
                    auto_self_block = False
        elif get_nb(tokens) == 4:
            index = get_first_index(tokens)
            if (
                tokens[index].is_identifier()
                and tokens[index + 1] == "."
                and tokens[index + 2] == "="
                and tokens[index + 1].end_col == tokens[index + 2].start_col
                and tokens[index + 3] == ":"
            ):
                self_name = tokens[index].string
                indentation = tokens[index].start_col
                auto_self_block = True
                continue
        new_tokens.extend(tokens)

    return token_utils.untokenize(new_tokens)
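# Hypothetical usage sketch, not part of the original example, applying
# automatic_self to a small constructor body like the one in the docstring.
def demo_automatic_self():
    source = (
        "class Point:\n"
        "    def __init__(self, x, y):\n"
        "        self .= :\n"
        "            x\n"
        "            y\n"
    )
    print(automatic_self(source))
    # expected: the block is rewritten as `self.x = x` and `self.y = y`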
def transform_source(source, **kwargs):
    """Simple transformation: replaces any explicit float followed by ``D``
    by a Decimal.
    """
    tokens = token_utils.tokenize(source)
    for first, second in zip(tokens, tokens[1:]):
        if first.is_number() and "." in first.string and second == "D":
            first.string = f"Decimal('{first.string}')"
            second.string = ""
    return token_utils.untokenize(tokens)
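# Hypothetical usage sketch, not part of the original example: only a float
# immediately followed by the identifier D is converted; the D itself is
# erased from the output.
def demo_decimal_suffix():
    print(transform_source("price = 19.99 D"))  # price = Decimal('19.99')
    print(transform_source("plain = 19.99"))    # plain = 19.99 (unchanged)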
def french_to_english(source):
    """A simple replacement of 'French Python keywords' by their normal
    English versions.
    """
    new_tokens = []
    for token in token_utils.tokenize(source):
        if token.string in fr_to_py:
            token.string = fr_to_py[token.string]
        new_tokens.append(token)
    new_source = token_utils.untokenize(new_tokens)
    return new_source
def random_deletion(sentence, n=1):
    tokens = tokenize(sentence)
    # obviously, if there's only one word, don't delete it
    if len(tokens) == 1:
        return untokenize(tokens)
    # randomly delete up to n words
    count = 0
    while count < n:
        assert n < len(tokens)
        rand_index = random.randint(0, len(tokens) - 1)
        del tokens[rand_index]
        count += 1
    return untokenize(tokens)
def function_as_a_keyword(source):
    """A simple replacement of ``function`` by ``lambda``.

    Note that, while the string ``lambda`` is shorter than ``function``, we
    do not adjust the information (start_col, end_col) about the position
    of the token. ``untokenize`` uses that information together with the
    information about each original line, to properly keep track of the
    spacing between tokens.
    """
    new_tokens = []
    for token in token_utils.tokenize(source):
        if token == "function":
            token.string = "lambda"
        new_tokens.append(token)
    return token_utils.untokenize(new_tokens)
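# Hypothetical usage sketch, not part of the original example, illustrating
# the point made in the docstring: the spacing of the original line survives.
def demo_function_as_a_keyword():
    print(function_as_a_keyword("square = function x: x * x"))
    # expected output: square = lambda x: x * x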
def convert_repeat(source, predictable_names=False):
    """Replaces instances of::

        repeat forever: -> while True:
        repeat while condition: -> while condition:
        repeat until condition: -> while not condition:
        repeat n: -> for _uid in range(n):

    A complete repeat statement is restricted to being on a single line
    ending with a colon (optionally followed by a comment). If the colon
    is missing, a ``RepeatSyntaxError`` is raised.
    """
    new_tokens = []
    if predictable_names:
        variable_name = utils.generate_predictable_names()
    else:
        variable_name = utils.generate_variable_names()
    for tokens in token_utils.get_lines(source):
        # a line of tokens can start with INDENT or DEDENT tokens ...
        first_token = token_utils.get_first(tokens)
        if first_token == "repeat":
            last_token = token_utils.get_last(tokens)
            if last_token != ":":
                raise RepeatSyntaxError(
                    "Missing colon for repeat statement on line "
                    + f"{first_token.start_row}\n    {first_token.line}.")
            repeat_index = token_utils.get_first_index(tokens)
            second_token = tokens[repeat_index + 1]
            if second_token == "forever":
                first_token.string = "while"
                second_token.string = "True"
            elif second_token == "while":
                first_token.string = "while"
                second_token.string = ""
            elif second_token == "until":
                first_token.string = "while"
                second_token.string = "not"
            else:
                first_token.string = "for %s in range(" % next(variable_name)
                last_token.string = "):"
        new_tokens.extend(tokens)
    return token_utils.untokenize(new_tokens)
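# Hypothetical usage sketch, not part of the original example: with
# predictable_names=True the generated loop variable is deterministic.
def demo_convert_repeat():
    source = "repeat 3:\n    print('hi')\n"
    print(convert_repeat(source, predictable_names=True))
    # expected shape: for <generated_name> in range( 3):
    #                     print('hi')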
def pyminify(options, _file):
    module = os.path.split(_file)[1]
    module = ".".join(module.split('.')[:-1])
    filesize = os.path.getsize(_file)
    source = open(_file, 'rb').read()
    tokens = token_utils.listified_tokenizer(source)
    # Perform obfuscation if any of the related options were set
    if options['obfuscate']:
        identifier_length = int(options['replacement_length'])
        name_generator = obfuscate.obfuscation_machine(
            identifier_length=identifier_length)
        obfuscate.obfuscate(module, tokens, options)
    result = token_utils.untokenize(tokens).strip()
    #result = filter(lambda x: x != '\r' and x != '\n', ' '.join(result.split()))
    print(result)
def replace(sentence, the_word, synonym):
    tokens = tokenize(sentence)
    # replace the_word with synonym
    try:
        assert the_word in tokens
    except AssertionError:
        print("AssertionError")
        print("sentence: {}\nthe word: {}\nsynonym: {}".format(
            sentence, the_word, synonym))
        return None
    new_tokens = [synonym if word == the_word else word for word in tokens]
    new_sentence = untokenize(new_tokens)
    # print("--old: ", sentence)
    # print("replaced", the_word, "with", synonym)
    # print("--new: ", new_sentence)
    return new_sentence
def join_multiline_pairs(source, pair="()"):
    """
    Finds and removes newlines in multiline matching pairs of characters in
    *source*.

    By default it joins parens () but it will join any two characters given
    via the *pair* variable.

    .. note::

        Doesn't remove extraneous whitespace that ends up between the pair.
        Use `reduce_operators()` for that.

    Example::

        test = (
            "This is inside a multi-line pair of parentheses"
        )

    Will become::

        test = (            "This is inside a multi-line pair of parentheses"        )
    """
    opener = pair[0]
    closer = pair[1]
    io_obj = io.StringIO(source)
    out_tokens = []
    open_count = 0
    for tok in tokenize.generate_tokens(io_obj.readline):
        token_type = tok[0]
        token_string = tok[1]
        if token_type == tokenize.OP and token_string in pair:
            if token_string == opener:
                open_count += 1
            elif token_string == closer:
                open_count -= 1
            out_tokens.append(tok)
        elif token_type in (tokenize.NL, tokenize.NEWLINE):
            if open_count == 0:
                out_tokens.append(tok)
        else:
            out_tokens.append(tok)
    return token_utils.untokenize(out_tokens)
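# Hypothetical usage sketch, not part of the original function, mirroring the
# docstring example.
def demo_join_multiline_pairs():
    source = 'test = (\n    "This is inside a multi-line pair of parentheses"\n)\n'
    print(join_multiline_pairs(source))
    # expected: the three physical lines collapse onto a single line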
def minify(tokens):
    """
    Performs minification on *tokens*
    """
    # Remove comments
    remove_comments(tokens)
    # Remove docstrings
    remove_docstrings(tokens)
    result = token_utils.untokenize(tokens)
    # Minify our input script
    result = multiline_indicator.sub('', result)
    result = fix_empty_methods(result)
    result = join_multiline_pairs(result)
    result = join_multiline_pairs(result, '[]')
    result = join_multiline_pairs(result, '{}')
    result = remove_blank_lines(result)
    result = reduce_operators(result)
    result = dedent(result)
    return result
def transform_source(source, **kwargs):
    """Does the following transformation::

        with float_as_Decimal:
            a = 1.0
            b = 2.0
        c = 3.0

    to::

        if True: # with float_as_Decimal:
            a = Decimal('1.0')
            b = Decimal('2.0')
        c = 3.0
    """
    new_tokens = []
    decimal_block = False
    for line in token_utils.get_lines(source):
        first = token_utils.get_first(line)
        if first is None:
            new_tokens.extend(line)
            continue
        elif first == "with":
            first_index = token_utils.get_first_index(line)
            if len(line) > first_index + 1:
                second = line[first_index + 1]
                if second == "float_as_Decimal":
                    first.string = "if"
                    second.string = "True"
                    indentation = first.start_col
                    decimal_block = True
        elif decimal_block and first.start_col > indentation:
            for token in line:
                if token.is_number() and "." in token.string:
                    token.string = f"Decimal('{token.string}')"
        else:
            indentation = first.start_col
        new_tokens.extend(line)
    return token_utils.untokenize(new_tokens)
def minify(tokens, options):
    """
    Performs minification on *tokens* according to the values in *options*
    """
    # Remove comments
    remove_comments(tokens)
    # Remove docstrings
    remove_docstrings(tokens)
    result = token_utils.untokenize(tokens)
    # Minify our input script
    result = multiline_indicator.sub('', result)
    result = fix_empty_methods(result)
    result = join_multiline_pairs(result)
    result = join_multiline_pairs(result, '[]')
    result = join_multiline_pairs(result, '{}')
    result = remove_blank_lines(result)
    result = reduce_operators(result)
    result = dedent(result, use_tabs=options.tabs)
    return result
def nobreak_as_a_keyword(source):
    """``nobreak`` is replaced by ``else`` only if it is the first
    non-space token on a line and if its indentation matches that of a
    ``for`` or ``while`` block.
    """
    indentations = {}
    lines = token_utils.get_lines(source)
    new_tokens = []
    for line in lines:
        first = token_utils.get_first(line)
        if first is None:
            new_tokens.extend(line)
            continue
        if first == "nobreak":
            if first.start_col in indentations:
                if indentations[first.start_col] in ["for", "while"]:
                    first.string = "else"
        indentations[first.start_col] = first.string
        new_tokens.extend(line)
    return token_utils.untokenize(new_tokens)
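# Hypothetical usage sketch, not part of the original example: `nobreak`
# lines up with the `for`, so it is rewritten as the loop's `else` clause.
def demo_nobreak():
    source = (
        "for i in range(3):\n"
        "    pass\n"
        "nobreak:\n"
        "    print('loop ended without break')\n"
    )
    print(nobreak_as_a_keyword(source))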
def add_multiplication_symbol(source):
    """This adds a multiplication symbol where it would be understood as
    being implicit by the normal way algebraic equations are written but
    would be a SyntaxError in Python. Thus we have::

        2n -> 2*n
        n 2 -> n* 2
        2(a+b) -> 2*(a+b)
        (a+b)2 -> (a+b)*2
        2 3 -> 2* 3
        m n -> m* n
        (a+b)c -> (a+b)*c

    The obvious one (in algebra) being left out is something like ``n(...)``
    which is a function call - and thus valid Python syntax.
    """
    tokens = token_utils.tokenize(source)
    if not tokens:
        return tokens
    prev_token = tokens[0]
    new_tokens = [prev_token]
    for token in tokens[1:]:
        # The code has been written in a way to demonstrate that this type of
        # transformation could be done as the source is tokenized by Python.
        if (
            (prev_token.is_number()
             and (token.is_identifier() or token.is_number() or token == "("))
            or (prev_token.is_identifier()
                and (token.is_identifier() or token.is_number()))
            or (prev_token == ")"
                and (token.is_identifier() or token.is_number()))
        ):
            new_tokens.append("*")
        new_tokens.append(token)
        prev_token = token
    return token_utils.untokenize(new_tokens)
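# Hypothetical usage sketch, not part of the original example. Note that a
# form like `2n` written with no space may be rejected outright by the
# tokenizer of recent CPython versions, so a parenthesized form is used here.
def demo_implicit_multiplication():
    print(add_multiplication_symbol("y = 2(a + b)"))
    # expected output: y = 2*(a + b)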
def random_swap(sentence, distance=1):
    """randomly swap words in a sentence

    :params[in]: sentence, a string, input sentence
    :params[in]: distance, integer, distance of words
    :params[out]: n_sentence, a string, new sentence
    """
    # lis = sent.split(' ')  # split by spaces
    tokens = tokenize(sentence)
    tokens_length = len(tokens)
    assert tokens_length >= 2
    index1 = random.randint(0, tokens_length - 1)
    # candidates pool
    candidates = set(range(index1 - distance, index1 + distance + 1)) & set(range(tokens_length))
    candidates.remove(index1)
    # randomly sample another index
    index2 = random.sample(candidates, 1)[0]
    # swap two elements
    tokens[index1], tokens[index2] = tokens[index2], tokens[index1]
    # n_sen = ' '.join(lis)
    n_sentence = untokenize(tokens)
    # return new sentence
    return n_sentence
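# Hypothetical usage sketch, not part of the original function: assumes the
# module-level `tokenize`/`untokenize` are word-level helpers (not Python's
# tokenize module). Seeding random makes the swap reproducible.
def demo_random_swap():
    random.seed(0)
    print(random_swap("the quick brown fox", distance=1))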
def check_lines(source):
    lines = token_utils.get_lines(source)
    tokens = []
    for line in lines:
        tokens.extend(line)
    assert source == token_utils.untokenize(tokens)
def toValidEqn(source):
    """This adds a multiplication symbol where it would be understood as
    being implicit by the normal way algebraic equations are written but
    would be a SyntaxError in Python. Thus we have::

        2N -> 2*N
        N 2 -> N* 2
        2(A+B) -> 2*(A+B)
        (A+B)2 -> (A+B)*2
        2 3 -> 2* 3
        M N -> M* N
        (A+B)C -> (A+B)*C
        A(3) -> A*(3)
        a(3) -> a(3) - will only add multiplication if the preceding token
        is capital, since that is a variable

    Modified from ideas:
    https://github.com/aroberge/ideas/blob/master/ideas/examples/implicit_multiplication.py
    """
    constants = [
        'BLUE', 'RED', 'BLACK', 'MAGENTA', 'GREEN', 'ORANGE', 'BROWN',
        'NAVY', 'LTBLUE', 'YELLOW', 'WHITE', 'LTGRAY', 'MEDGRAY', 'GRAY',
        'DARKGRAY'
    ]

    tokens = token_utils.tokenize(source)
    if not tokens:
        return tokens
    prev_token = tokens[0]
    new_tokens = [prev_token]
    for token in tokens[1:]:
        if token.is_not_in(constants):
            # Check if implicit multiplication should be added
            if (
                ((prev_token.is_number()
                  or (prev_token.is_identifier() and prev_token.string.isupper()))
                 and ((token.is_identifier() and token.string.isupper())
                      or token.is_number() or token == "("))
                or ((prev_token.is_identifier() and prev_token.string.isupper())
                    and ((token.is_identifier() and token.string.isupper())
                         or token.is_number()))
                or (prev_token == ")"
                    and ((token.is_identifier() and token.string.isupper())
                         or token.is_number()))
            ):
                new_tokens.append("*")
            if (token.is_identifier() and token.string.isupper()
                    and len(token.string) > 1):
                # Multiple variables next to one another
                # ABC -> A*B*C
                token.string = '*'.join(token.string)
                new_tokens.append(token)
            else:
                new_tokens.append(token)
        else:
            # Token in constants, skip
            new_tokens.append(token)
        prev_token = token
    return token_utils.untokenize(new_tokens)
for function in functions:
    replace_obfuscatables(
        module, tokens, obfuscate_function, function, name_generator, table)
for _class in classes:
    replace_obfuscatables(
        module, tokens, obfuscate_class, _class, name_generator, table)
obfuscate_global_import_methods(module, tokens, name_generator, table)
obfuscate_builtins(module, tokens, name_generator, table)


if __name__ == "__main__":
    global name_generator
    if len(sys.argv) != 3:
        print("Usage: %s <emoji_length> <filename.py>" % sys.argv[0])
        sys.exit(1)
    source = open(sys.argv[2]).read()
    replacement_length = int(sys.argv[1])
    tokens = token_utils.listified_tokenizer(source)
    source = minification.minify(tokens)
    tokens = token_utils.listified_tokenizer(source)
    obfuscate(source, tokens, replacement_length)
    result = ''
    result += token_utils.untokenize(tokens)
    # print(result)
def test_indent():
    new_tokens = token_utils.indent(tokens2, 4)
    new_line_a = token_utils.untokenize(new_tokens)
    new_line_b = token_utils.untokenize(lines3[2])
    assert new_line_a == new_line_b
def convert_switch(source, predictable_names=False):
    """Replaces code like::

        switch EXPR:
            case EXPR_1:
                SUITE
            case EXPR_2:
                SUITE
            case in EXPR_3, EXPR_4, ...:
                SUITE
            ...
            else:
                SUITE

    by::

        var_name = EXPR
        if var_name == EXPR_1:
            SUITE
        elif var_name == EXPR_2:
            SUITE
        elif var_name in EXPR_3, EXPR_4, ...:
            SUITE
        ...
        else:
            SUITE
        del var_name

    Limitation: switch blocks cannot be part of a SUITE of another switch
    block.
    """
    new_tokens = []
    switch_block = False
    first_case = False
    if predictable_names:
        variable_name = utils.generate_predictable_names()
    else:
        variable_name = utils.generate_variable_names()

    for line in token_utils.get_lines(source):
        first_token = token_utils.get_first(line)
        if first_token is None:
            new_tokens.extend(line)
            continue

        if len(line) > 1:
            _index = token_utils.get_first_index(line)
            second_token = line[_index + 1]
        else:
            second_token = None

        if not switch_block:
            if first_token == "switch":
                switch_indent = first_token.start_col
                var_name = next(variable_name)
                first_token.string = f"{var_name} ="
                switch_block = True
                first_case = True
                colon = token_utils.get_last(line)
                colon.string = ""
        else:
            if first_token.start_col == switch_indent:
                switch_block = False
                new_tokens.extend([" " * switch_indent + f"del {var_name}\n"])
            elif first_token == "case" or first_token == "else":
                if first_case and first_token == "case":
                    if second_token == "in":
                        first_token.string = f"if {var_name}"
                    else:
                        first_token.string = f"if {var_name} =="
                    first_case = False
                elif first_token == "case":
                    if second_token == "in":
                        first_token.string = f"elif {var_name}"
                    else:
                        first_token.string = f"elif {var_name} =="
                dedent = first_token.start_col - switch_indent
                line = token_utils.dedent(line, dedent)
        new_tokens.extend(line)

    return token_utils.untokenize(new_tokens)
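# Hypothetical usage sketch, not part of the original example: with
# predictable_names=True the temporary variable name is deterministic, and a
# matching `del` is emitted once the switch block ends.
def demo_convert_switch():
    source = (
        "switch n:\n"
        "    case 0:\n"
        "        print('zero')\n"
        "    else:\n"
        "        print('other')\n"
        "print('done')\n"
    )
    print(convert_switch(source, predictable_names=True))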
def check(source):
    tokens = token_utils.tokenize(source)
    new_source = token_utils.untokenize(tokens)
    print(len(source), len(new_source))
    assert source == new_source