def predict(self):
    input_filename = 'Input.java'
    print('Starting interactive prediction...')
    while True:
        print('Modify the file: "%s" and press any key when ready, or "q" / "quit" / "exit" to exit' % input_filename)
        user_input = input()
        if user_input.lower() in self.exit_keywords:
            print('Exiting...')
            return
        try:
            predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(input_filename)
        except ValueError as e:
            print(e)
            continue
        results = self.model.predict(predict_lines)
        prediction_results = common.parse_results(results, hash_to_string_dict, topk=SHOW_TOP_CONTEXTS)
        for method_prediction in prediction_results:
            print('Original name:\t' + method_prediction.original_name)
            for name_prob_pair in method_prediction.predictions:
                print('\t(%f) predicted: %s' % (name_prob_pair['probability'], name_prob_pair['name']))
            print('Attention:')
            for attention_obj in method_prediction.attention_paths:
                print('%f\tcontext: %s,%s,%s' % (attention_obj['score'], attention_obj['token1'], attention_obj['path'], attention_obj['token2']))
def predict_route():
    # Reject oversized request bodies (10 MB cap).
    if request.content_length > 10000000:
        abort(400)
    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, 'w') as tmp:
            tmp.write(request.get_data(as_text=True))
        predict_lines, hash_to_string_dict = SERVER.path_extractor.extract_paths(path)
    finally:
        os.remove(path)
    results = SERVER.model.predict(predict_lines)
    method_prediction = common.parse_results(results, hash_to_string_dict, topk=SHOW_TOP_CONTEXTS)[0]
    ans = {
        'predictions': method_prediction.predictions,
        'attention_paths': method_prediction.attention_paths
    }
    return json.dumps(ans)
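# A minimal client sketch for the route above, not part of the original
# source. It assumes the route is registered at POST /predict on
# localhost:8080; both the path and the port are assumptions to adjust to
# the actual app.route(...) registration.
def example_predict_request(java_source, url='http://localhost:8080/predict'):
    import requests  # assumed to be available; any HTTP client works

    # The route reads the raw request body as text, so post the source as-is.
    response = requests.post(url, data=java_source.encode('utf-8'))
    response.raise_for_status()
    return response.json()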
def predict(self):
    import numpy as np  # used below to inspect the code-vector shape

    input_filename = 'Input.java'
    print('Starting interactive prediction...')
    while True:
        print('Modify the file: "%s" and press any key when ready, or "q" / "quit" / "exit" to exit' % input_filename)
        user_input = input()
        if user_input.lower() in self.exit_keywords:
            print('Exiting...')
            return
        try:
            print("input_filename(ohazyi) = ", input_filename)
            predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(input_filename)
            print("predict_lines(ohazyi) = ", predict_lines)
            print("hash_to_string_dict(ohazyi) = ", hash_to_string_dict)
        except ValueError as e:
            print(e)
            continue
        results, code_vectors = self.model.predict(predict_lines)
        print('results(ohazyi)=', results)
        print('code_vectors(ohazyi)', code_vectors)
        print("code_vectors(ohazyi).shape", np.array(code_vectors).shape)
        prediction_results = common.parse_results(results, hash_to_string_dict, topk=SHOW_TOP_CONTEXTS)
        print("prediction_results(ohazyi)=", prediction_results)
        for i, method_prediction in enumerate(prediction_results):
            print("i=", i)
            print('Original name:\t' + method_prediction.original_name)
            for name_prob_pair in method_prediction.predictions:
                print('\t(%f) predicted: %s' % (name_prob_pair['probability'], name_prob_pair['name']))
            print('Attention:')
            for attention_obj in method_prediction.attention_paths:
                print('%f\tcontext: %s,%s,%s' % (attention_obj['score'], attention_obj['token1'], attention_obj['path'], attention_obj['token2']))
            if self.config.EXPORT_CODE_VECTORS:
                print("Yes(ohazyi)!!!")
                print('Code vector:')
                print(' '.join(map(str, code_vectors[i])))
def predict(self):
    files = list_all_files('/Users/apple/Desktop/test')
    print('Starting interactive prediction...')
    with open("./data/ABC.txt", mode='w') as out:
        for file in files:
            if file.split('.')[-1] == 'java':
                try:
                    predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(file)
                except ValueError as e:
                    print(e)
                    continue
                results, code_vectors = self.model.predict(predict_lines)
                prediction_results = common.parse_results(results, hash_to_string_dict, topk=SHOW_TOP_CONTEXTS)
                # Write one line per method: file path, original name, code vector.
                for i, method_prediction in enumerate(prediction_results):
                    out.write(file + ' ' + method_prediction.original_name + ' ' + ' '.join(map(str, code_vectors[i])))
                    out.write('\n')
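# `list_all_files` is used above but not defined in this snippet. A plausible
# stand-in (an assumption, not the original helper) walks the tree and
# returns every file path:
def list_all_files(root_dir):
    import os
    result = []
    for dirpath, _, filenames in os.walk(root_dir):
        for name in filenames:
            result.append(os.path.join(dirpath, name))
    return result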
def dn_predict(self):
    print('Starting interactive prediction...')
    data_list = glob.glob("data/in_use/*/*.java")
    for input_filename in data_list:
        print(input_filename)
        if input_filename.lower() in self.exit_keywords:
            print('Exiting...')
            return
        try:
            predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(input_filename)
        except ValueError as e:
            print(e)
            continue
        results, code_vectors = self.model.predict(predict_lines)
        prediction_results = common.parse_results(results, hash_to_string_dict, topk=SHOW_TOP_CONTEXTS)
        for i, method_prediction in enumerate(prediction_results):
            print('Original name:\t' + method_prediction.original_name)
            for name_prob_pair in method_prediction.predictions:
                print('\t(%f) predicted: %s' % (name_prob_pair['probability'], name_prob_pair['name']))
            print('Attention:')
            for attention_obj in method_prediction.attention_paths:
                print('%f\tcontext: %s,%s,%s' % (attention_obj['score'], attention_obj['token1'], attention_obj['path'], attention_obj['token2']))
            if self.config.EXPORT_CODE_VECTORS:
                print('Code vector:')
                print(' '.join(map(str, code_vectors[i])))
                with open('jms_output.txt', 'a') as f_out:
                    f_out.write("{}\t{}\n".format(input_filename, ', '.join(map(str, code_vectors[i]))))
                    'size.')
args = parser.parse_args()
funcs = file2function_array(args.filename)

# build up hash to path dict
h2p_dict = {}
for f in funcs:
    h2p_dict.update(f.get_pathdict())

with tf.device('/cpu:0'):
    config = Config.get_default_config(args)
    model = Model(config)
    results, code_vector = model.predict([f.to_str_with_padding() for f in funcs if f.has_pair()])
    prediction_results = common.parse_results(results, h2p_dict)
    for method_prediction in prediction_results:
        print('Original name:\t' + method_prediction.original_name)
        for name_prob_pair in method_prediction.predictions:
            print('\t(%f) predicted: %s' % (name_prob_pair['probability'], name_prob_pair['name']))
        print('Attention:')
        for attention_obj in method_prediction.attention_paths:
            print('%f\tcontext: %s,%s,%s' % (attention_obj['score'], attention_obj['token1'], attention_obj['path'], attention_obj['token2']))

model.close_session()
def predict(self):
    input_filename = 'Input.java'
    word_to_indextop, indextop_to_word = self.model.create_ordered_words_dictionary(
        self.model.get_data_dictionaries_path(self.config.LOAD_PATH),
        self.config.MAX_WORDS_FROM_VOCAB_FOR_ADVERSARIAL)
    print('Starting interactive prediction with BFS adversarial search... (depth = {}, topk = {})'
          .format(self.max_depth, self.topk))
    while True:
        print('Modify the file: "%s" and press any key when ready, or "q" / "quit" / "exit" to exit' % input_filename)
        user_input = input()
        if user_input.lower() in self.exit_keywords:
            print('Exiting...')
            return
        with open(input_filename, "r") as f:
            original_code = f.read()
        try:
            predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(input_filename)
        except ValueError as e:
            print(e)
            continue
        # Drop the leading variable list from the extracted context line.
        var_code_split_index = predict_lines[0].find(" ")
        original_code = predict_lines[0][var_code_split_index + 1:]
        results = self.model.predict([original_code])
        prediction_results = common.parse_results(results, hash_to_string_dict, topk=SHOW_TOP_CONTEXTS)
        for method_prediction in prediction_results:
            print('Original name:\t' + method_prediction.original_name)
            for name_prob_pair in method_prediction.predictions:
                print('\t(%f) predicted: %s' % (name_prob_pair['probability'], name_prob_pair['name']))
        # generate pca
        self.model.creat_PCA_tokens(predict_lines[0])
        # Search for adversarial examples
        print("select variable to rename OR -- to skip search:")
        var_to_rename = input()
        if var_to_rename == "--":
            continue
        while True:
            print("select attack type: 'nontargeted' for non-targeted attack")
            print("OR target method name for targeted attack (each word is separated by |)")
            attack_type = input()
            if attack_type == "nontargeted":
                # untargeted searcher
                print("Using non-targeted attack")
                searcher = AdversarialSearcher(self.topk, self.max_depth, word_to_indextop, indextop_to_word,
                                               predict_lines[0], lambda c, v: [(var_to_rename, var_to_rename)])
                break
            else:
                # targeted searcher
                if attack_type in self.model.target_word_to_index:
                    print("Using targeted attack. target:", attack_type)
                    searcher = AdversarialTargetedSearcher(self.topk, self.max_depth, word_to_indextop,
                                                           indextop_to_word, predict_lines[0], attack_type,
                                                           lambda c, v: [(var_to_rename, var_to_rename)])
                    break
                print(attack_type, "does not exist in vocab! try again")
        adversarial_results = []
        while True:
            batch_nodes_data = [(n, c) for n, c in searcher.pop_unchecked_adversarial_code()]
            batch_data = [c for _, c in batch_nodes_data]
            results = self.model.predict(batch_data, self.guard_input)
            for (node, _), res in zip(batch_nodes_data, results):
                one_top_words = res[1]
                one_top_words = common.filter_impossible_names(one_top_words)
                if not one_top_words:
                    print("code with state: " + str(node) + " caused empty predictions\n")
                    continue
                if searcher.is_target_found(one_top_words):
                    adversarial_results.append((one_top_words[0], node))
            if adversarial_results and not self.multiple_results:
                break
            batch_data = [searcher.get_adversarial_code()]
            batch_word_to_derive = [searcher.get_word_to_derive()]
            loss, all_grads = self.model.calc_loss_and_gradients_wrt_input(batch_data, batch_word_to_derive,
                                                                           indextop_to_word)
            if not searcher.next((0, "", all_grads[0])):
                break
        if not adversarial_results:
            print("FAILED! no replacements found")
        else:
            print("variable replacements:")
            print("Prediction\tnode")
            for r in adversarial_results:
                print(r[0], "\t", r[1])
def predict(self):
    input_filename = 'Input.java'
    MAX_ATTEMPTS = 50
    MAX_NODES_TO_OPEN = 10
    words_vocab = self.model.get_words_vocab_embed()
    # words_vocab = words_vocab / np.linalg.norm(words_vocab, axis=1).reshape((-1, 1))
    print('Starting interactive prediction with similar adversarial search...')
    while True:
        print('Modify the file: "%s" and press any key when ready, or "q" / "quit" / "exit" to exit' % input_filename)
        user_input = input()
        if user_input.lower() in self.exit_keywords:
            print('Exiting...')
            return
        print("select variable to rename:")
        var_to_rename = newname_of_var = input()
        name_found = False
        closed = [var_to_rename]
        with open(input_filename, "r") as f:
            original_code = f.read()
        for i in range(MAX_ATTEMPTS):
            try:
                predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(input_filename)
            except ValueError as e:
                print(e)
                continue
            results = self.model.predict(predict_lines)
            prediction_results = common.parse_results(results, hash_to_string_dict, topk=SHOW_TOP_CONTEXTS)
            for method_prediction in prediction_results:
                print('Original name:\t' + method_prediction.original_name)
                for name_prob_pair in method_prediction.predictions:
                    print('\t(%f) predicted: %s' % (name_prob_pair['probability'], name_prob_pair['name']))
            if '|'.join(method_prediction.predictions[0]['name']) != method_prediction.original_name:
                print("MATCH FOUND!", newname_of_var)
                print("Tried (total:", len(closed), ") :: ", closed)
                name_found = True
                break
            loss, all_strings, all_grads = self.model.calc_loss_and_gradients_wrt_input(predict_lines)
            indices_of_var = np.argwhere(all_strings == newname_of_var.lower()).flatten()
            grads_of_var = all_grads[indices_of_var]
            if grads_of_var.shape[0] > 0:
                # print("current loss:", loss)
                total_grad = np.sum(grads_of_var, axis=0)
                # # words to increase loss
                # top_replace_with = np.argsort(total_grad)[::-1][:5]
                # result = [(i, total_grad[i], self.model.index_to_word[i]) for i in top_replace_with]
                # print("words to increase loss:")
                # print(result)
                # words to decrease loss
                # top_replace_with = np.argsort(total_grad)[:5]
                similarity_to_var = self.get_similar_words(newname_of_var)
                # similarity_to_var = self.measureSimilarity(words_vocab, newname_of_var, "euclidean")
                result = [(self.model.word_to_index[i], i, total_grad[self.model.word_to_index[i]])
                          for i in similarity_to_var]
                result.sort(key=lambda v: -v[2])
                print(result)
                # similarity_to_var = self.measureSimilarity(words_vocab, newname_of_var, "cosine")
                # result = [(i, self.model.index_to_word[i], similarity_to_var[i], total_grad[i])
                #           for i in range(1, words_vocab.shape[0])]
                # result.sort(key=lambda v: (v[2], -v[3]))
                # select new name
                for r in result:
                    if (r[1] not in closed
                            and r[1] != method_prediction.original_name.replace("|", "")
                            and r[2] > 0):
                        print(r)
                        newname_of_var = r[1]
                        break
                else:
                    newname_of_var = None
            if newname_of_var is None:
                break
            closed.append(newname_of_var)
            print("rename", var_to_rename, "to", newname_of_var)
            code = InteractivePredictor.rename_variable(original_code, var_to_rename, newname_of_var)
            with open(input_filename, "w") as f:
                f.write(code)
        if not name_found:
            print("FAILED!")
            print("Tried (total:", len(closed), ") :: ", closed)
def predict(self):
    input_filename = 'Input.java'
    MAX_ATTEMPTS = 50
    MAX_NODES_TO_OPEN = 10
    print('Starting interactive prediction with mono adversarial search...')
    while True:
        print('Modify the file: "%s" and press any key when ready, or "q" / "quit" / "exit" to exit' % input_filename)
        user_input = input()
        if user_input.lower() in self.exit_keywords:
            print('Exiting...')
            return
        print("select variable to rename:")
        var_to_rename = newname_of_var = input()
        name_found = False
        opened = []
        closed = []
        with open(input_filename, "r") as f:
            original_code = f.read()
        for i in range(MAX_ATTEMPTS):
            try:
                predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(input_filename)
            except ValueError as e:
                print(e)
                continue
            # Benchmarking leftover: this block returns early, so everything
            # below it in this function is currently unreachable.
            bfs = AdversarialSearcher(2, 2, self.model)
            r = bfs.find_adversarial(predict_lines)
            print(r)
            print(timeit.timeit(lambda: bfs.find_adversarial(predict_lines), number=1000))
            return
            results = self.model.predict(predict_lines)
            prediction_results = common.parse_results(results, hash_to_string_dict, topk=SHOW_TOP_CONTEXTS)
            for method_prediction in prediction_results:
                print('Original name:\t' + method_prediction.original_name)
                for name_prob_pair in method_prediction.predictions:
                    print('\t(%f) predicted: %s' % (name_prob_pair['probability'], name_prob_pair['name']))
            if '|'.join(method_prediction.predictions[0]['name']) == method_prediction.original_name:
                print("MATCH FOUND!", newname_of_var)
                print("Tried (total:", len(closed), ") :: ", closed)
                name_found = True
                break
            loss, all_strings, all_grads = self.model.calc_loss_and_gradients_wrt_input(predict_lines)
            indices_of_var = np.argwhere(all_strings == newname_of_var.lower()).flatten()
            grads_of_var = all_grads[indices_of_var]
            if grads_of_var.shape[0] > 0:
                # print("current loss:", loss)
                total_grad = np.sum(grads_of_var, axis=0)
                # # words to increase loss
                # top_replace_with = np.argsort(total_grad)[::-1][:5]
                # result = [(i, total_grad[i], self.model.index_to_word[i]) for i in top_replace_with]
                # print("words to increase loss:")
                # print(result)
                # words to decrease loss
                top_replace_with = np.argsort(total_grad)[:5]
                result = [(i, total_grad[i], self.model.index_to_word[i]) for i in top_replace_with]
                # select new name
                for r in result:
                    if r[2] not in closed and r[2] != method_prediction.original_name.replace("|", ""):
                        print(r)
                        newname_of_var = r[2]
                        break
                else:
                    newname_of_var = None
            if newname_of_var is None:
                break
            closed.append(newname_of_var)
            print("rename", var_to_rename, "to", newname_of_var)
            code = InteractivePredictor.rename_variable(original_code, var_to_rename, newname_of_var)
            with open(input_filename, "w") as f:
                f.write(code)
        if not name_found:
            print("FAILED!")
            print("Tried (total:", len(closed), ") :: ", closed)
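# `InteractivePredictor.rename_variable` is called by both search loops above
# but is not shown in this section. A naive whole-word regex substitution is
# one plausible sketch (an assumption; a robust version should rename via the
# parsed AST so that strings, comments, and unrelated identifiers are left
# untouched):
def rename_variable(code, old_name, new_name):
    import re
    # \b anchors restrict the substitution to whole-word occurrences.
    return re.sub(r'\b%s\b' % re.escape(old_name), new_name, code)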
def predict(self):
    # MAX_ATTEMPTS = 50
    # MAX_NODES_TO_OPEN = 10
    src_folder = "test_adversarial/src"
    # input_src = ["contains.java", "count.java", "done.java", "escape.java", "factorial.java", "get.java",
    #              "indexOf.java", "isPrime.java", "postRequest.java", "reverseArray.java", "sort.java"]
    input_src = os.listdir(src_folder)
    targets = [
        "sort", "contains", "get", "index|of", "done", "reverse|array",
        "count", "is|prime", "post|request", "escape", "add", "close",
        "main", "max", "min", "factorial", "load", "foo", "update", "bar",
        "exception", "test", "swap", "predict"
    ]
    word_to_indextop, indextop_to_word = self.model.create_ordered_words_dictionary(
        self.model.get_data_dictionaries_path(self.config.LOAD_PATH),
        self.config.MAX_WORDS_FROM_VOCAB_FOR_ADVERSARIAL)
    print('Starting interactive prediction with BFS adversarial search... (depth = {}, topk = {})'
          .format(self.max_depth, self.topk))
    for src in input_src:
        print('SAMPLE: ', src)
        input_filename = src_folder + "/" + src
        try:
            predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(input_filename)
        except ValueError as e:
            print(e)
            continue
        var, original_code = common_adversarial.separate_vars_code(predict_lines[0])
        # ignore methods without vars
        if not common_adversarial.get_all_vars(var):
            print("NO VARS. skip.")
            continue
        results = self.model.predict([original_code])
        prediction_results = common.parse_results(results, hash_to_string_dict, topk=SHOW_TOP_CONTEXTS)
        # skip methods that were predicted incorrectly
        method_prediction = prediction_results[0]
        if method_prediction.original_name.lower() != "".join(method_prediction.predictions[0]['name']):
            print("WRONG PREDICTION. skip. (true: {}, pred: {})".format(
                method_prediction.original_name, method_prediction.predictions))
            continue
        for method_prediction in prediction_results:
            print('Original name:\t' + method_prediction.original_name)
            for name_prob_pair in method_prediction.predictions:
                print('\t(%f) predicted: %s' % (name_prob_pair['probability'], name_prob_pair['name']))
        # Search for adversarial examples
        print("ADVERSARIAL results:")
        for target in targets:
            print("TARGET:", target)
            if target != "nontargeted" and target not in self.model.target_word_to_index:
                print("target does not exist. skip.")
                continue
            for var_to_rename in common_adversarial.get_all_vars(var):
                if target == "nontargeted":
                    # untargeted searcher
                    searcher = AdversarialSearcher(
                        self.topk, self.max_depth, word_to_indextop, indextop_to_word,
                        predict_lines[0], lambda c, v: [(var_to_rename, var_to_rename)])
                else:
                    # targeted searcher
                    searcher = AdversarialTargetedSearcher(
                        self.topk, self.max_depth, word_to_indextop, indextop_to_word,
                        predict_lines[0], target, lambda c, v: [(var_to_rename, var_to_rename)])
                adversarial_results = []
                while True:
                    batch_nodes_data = [(n, c) for n, c in searcher.pop_unchecked_adversarial_code()]
                    batch_data = [c for _, c in batch_nodes_data]
                    results = self.model.predict(batch_data, self.guard_input)
                    for (node, _), res in zip(batch_nodes_data, results):
                        one_top_words = res[1]
                        one_top_words = common.filter_impossible_names(one_top_words)
                        if not one_top_words:
                            print("code with state: " + str(node) + " caused empty predictions\n")
                            continue
                        if searcher.is_target_found(one_top_words):
                            adversarial_results.append((one_top_words[0], node))
                    if adversarial_results and not self.multiple_results:
                        break
                    batch_data = [searcher.get_adversarial_code()]
                    batch_word_to_derive = [searcher.get_word_to_derive()]
                    loss, all_grads = self.model.calc_loss_and_gradients_wrt_input(
                        batch_data, batch_word_to_derive, indextop_to_word)
                    if not searcher.next((0, "", all_grads[0])):
                        break
                for r in adversarial_results:
                    print(r[0], "\t\t\t", r[1])
def create_file_vectors(self):
    folder_dir = os.path.join(dataset_dir, self.data_dir)
    file_vectors = []
    file_num = 0
    # Loop through each class value
    for class_val in os.listdir(folder_dir):
        # Get each file from each class
        class_folder = os.path.join(folder_dir, class_val)
        if os.path.isdir(class_folder):
            file_list = os.listdir(class_folder)
            # Limit the number of files per class
            if len(file_list) > self.k:
                print("File list over the limit, randomly selecting {} files...".format(self.k))
                random.seed(42)
                file_list = random.sample(file_list, self.k)
            for file in file_list:
                time0 = time.time()
                method_vectors = []
                # Split the file into its constituent methods
                methods = self.class_preprocessor.get_methods(os.path.join(class_folder, file))
                for method in methods:
                    # Get the number of lines in the method
                    lines = method.count('\n')
                    # Write it to a temp file
                    try:
                        with open(tmp_file_name, mode='w') as tmp_file:
                            tmp_file.write(method)
                    except Exception as e:
                        if debug:
                            print("{}\n{}".format(e, method))
                    # Make the predictions
                    try:
                        predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(tmp_file_name)
                    except ValueError as e:
                        print("=====================")
                        if debug:
                            print("Error for method {} in file {}. Note this may simply be caused by the "
                                  "method being a constructor".format(method, file))
                            print("\nException message:\n")
                            print(e)
                        print("=====================")
                        continue
                    results, code_vectors = self.model.predict(predict_lines)
                    prediction_results = common.parse_results(results, hash_to_string_dict, topk=SHOW_TOP_CONTEXTS)
                    # Process the predictions
                    for i, method_prediction in enumerate(prediction_results):
                        method_vectors.append({"vector": code_vectors[i], "length": lines})
                file_vectors.append({
                    'methods': method_vectors,
                    'class_val': class_val,
                    'filename': file,
                    'processed': False
                })
                print("#{}\t{}\tTime: {}s".format(file_num, file, round(time.time() - time0, 3)))
                file_num += 1
    os.remove(tmp_file_name)
    return file_vectors