def predict(self):
    """Interactively predict method names for 'Input.java' until the user quits.

    Each round: prompt the user to edit the file, extract path contexts,
    run the model, and print the top predictions and attention paths.
    """
    input_filename = 'Input.java'
    print('Starting interactive prediction...')
    while True:
        print(
            'Modify the file: "%s" and press any key when ready, or "q" / "quit" / "exit" to exit'
            % input_filename)
        if input().lower() in self.exit_keywords:
            print('Exiting...')
            return
        try:
            predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(input_filename)
        except ValueError as err:
            # Extraction failed (e.g. unparsable Java) — report and re-prompt.
            print(err)
            continue
        raw_results = self.model.predict(predict_lines)
        parsed = common.parse_results(
            raw_results, hash_to_string_dict, topk=SHOW_TOP_CONTEXTS)
        for prediction in parsed:
            print('Original name:\t' + prediction.original_name)
            for pair in prediction.predictions:
                print('\t(%f) predicted: %s' % (pair['probability'], pair['name']))
            print('Attention:')
            for attn in prediction.attention_paths:
                print('%f\tcontext: %s,%s,%s' %
                      (attn['score'], attn['token1'], attn['path'], attn['token2']))
示例#2
0
 def predict_route():
     """Flask endpoint: predict a method name for Java source posted in the body.

     Rejects oversized (or missing Content-Length) uploads with HTTP 400,
     writes the body to a temp file for the path extractor, and returns the
     first parsed method's predictions and attention paths as a JSON string.
     """
     # request.content_length is None when the Content-Length header is
     # absent; the original `None > 10000000` comparison raised TypeError.
     # Treat an unknown length like an oversized payload.
     if request.content_length is None or request.content_length > 10000000:
         abort(400)
     fd, path = tempfile.mkstemp()
     try:
         # Wrap the raw fd so it is closed before extraction reads the file.
         with os.fdopen(fd, 'w') as tmp:
             tmp.write(request.get_data(as_text=True))
         predict_lines, hash_to_string_dict = SERVER.path_extractor.extract_paths(path)
     finally:
         # Always remove the temp file, even when extraction raises.
         os.remove(path)
     results = SERVER.model.predict(predict_lines)
     method_prediction = common.parse_results(results, hash_to_string_dict, topk=SHOW_TOP_CONTEXTS)[0]
     ans = {
         'predictions': method_prediction.predictions,
         'attention_paths': method_prediction.attention_paths
     }
     return json.dumps(ans)
示例#3
0
 def predict(self):
     """Interactive prediction loop that also prints per-method code vectors.

     Repeatedly lets the user edit 'Input.java', extracts path contexts,
     predicts method names, and prints top predictions, attention paths and
     (when config.EXPORT_CODE_VECTORS is set) each method's code vector.

     Leftover '(ohazyi)' debug prints and the numpy import inside the loop
     body were removed; functional output is unchanged.
     """
     input_filename = 'Input.java'
     print('Starting interactive prediction...')
     while True:
         print(
             'Modify the file: "%s" and press any key when ready, or "q" / "quit" / "exit" to exit'
             % input_filename)
         user_input = input()
         if user_input.lower() in self.exit_keywords:
             print('Exiting...')
             return
         try:
             predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(
                 input_filename)
         except ValueError as e:
             # Extraction failed — report and re-prompt.
             print(e)
             continue
         # This model variant returns both predictions and code vectors.
         results, code_vectors = self.model.predict(predict_lines)
         prediction_results = common.parse_results(results,
                                                   hash_to_string_dict,
                                                   topk=SHOW_TOP_CONTEXTS)
         for i, method_prediction in enumerate(prediction_results):
             print('Original name:\t' + method_prediction.original_name)
             for name_prob_pair in method_prediction.predictions:
                 print('\t(%f) predicted: %s' %
                       (name_prob_pair['probability'],
                        name_prob_pair['name']))
             print('Attention:')
             for attention_obj in method_prediction.attention_paths:
                 print('%f\tcontext: %s,%s,%s' %
                       (attention_obj['score'], attention_obj['token1'],
                        attention_obj['path'], attention_obj['token2']))
             if self.config.EXPORT_CODE_VECTORS:
                 print('Code vector:')
                 # Vectors are aligned with prediction_results by index.
                 print(' '.join(map(str, code_vectors[i])))
 def predict(self, source_dir='/Users/apple/Desktop/test', output_path='./data/ABC.txt'):
     """Batch-predict code vectors for every .java file under *source_dir*.

     Writes one line per method to *output_path* in the form
     "<file> <original_name> <vector components...>".

     The new parameters default to the previously hard-coded paths, so
     existing callers are unaffected. The output file is now opened with
     'with' so it is closed even when prediction raises.
     """
     files = list_all_files(source_dir)
     print('Starting interactive prediction...')
     with open(output_path, mode='w') as out:
         for file in files:
             # Only Java sources are fed to the extractor.
             if not file.endswith('.java'):
                 continue
             try:
                 predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(file)
             except ValueError as e:
                 print(e)
                 continue
             results, code_vectors = self.model.predict(predict_lines)
             prediction_results = common.parse_results(results, hash_to_string_dict, topk=SHOW_TOP_CONTEXTS)
             for i, method_prediction in enumerate(prediction_results):
                 # code_vectors is index-aligned with prediction_results.
                 out.write(file + ' ' + method_prediction.original_name + ' ' + ' '.join(map(str, code_vectors[i])))
                 out.write('\n')
示例#5
0
 def dn_predict(self):
     """Predict method names for every Java file under data/in_use/ and
     append each method's code vector to 'jms_output.txt'.

     Non-interactive variant of the interactive predict loop; the dead
     commented-out interactive scaffolding was removed.
     """
     print('Starting interactive prediction...')
     data_list = glob.glob("data/in_use/*/*.java")
     for input_filename in data_list:
         print(input_filename)
         # NOTE(review): leftover from the interactive version — this
         # compares a *filename* against the exit keywords, which can never
         # sensibly match. Kept to preserve behavior; consider deleting.
         if input_filename.lower() in self.exit_keywords:
             print('Exiting...')
             return
         try:
             predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(
                 input_filename)
         except ValueError as e:
             # Skip files the extractor cannot parse.
             print(e)
             continue
         results, code_vectors = self.model.predict(predict_lines)
         prediction_results = common.parse_results(results,
                                                   hash_to_string_dict,
                                                   topk=SHOW_TOP_CONTEXTS)
         for i, method_prediction in enumerate(prediction_results):
             print('Original name:\t' + method_prediction.original_name)
             for name_prob_pair in method_prediction.predictions:
                 print('\t(%f) predicted: %s' %
                       (name_prob_pair['probability'],
                        name_prob_pair['name']))
             print('Attention:')
             for attention_obj in method_prediction.attention_paths:
                 print('%f\tcontext: %s,%s,%s' %
                       (attention_obj['score'], attention_obj['token1'],
                        attention_obj['path'], attention_obj['token2']))
             if self.config.EXPORT_CODE_VECTORS:
                 print('Code vector:')
                 print(' '.join(map(str, code_vectors[i])))
                 # Append mode: one tab-separated line per method per file.
                 with open('jms_output.txt', 'a') as f_out:
                     f_out.write("{}\t{}\n".format(
                         input_filename,
                         ', '.join(map(str, code_vectors[i]))))
示例#6
0
        'size.')
    args = parser.parse_args()

    funcs = file2function_array(args.filename)

    # build up hash to path dict
    h2p_dict = {}
    for f in funcs:
        h2p_dict.update(f.get_pathdict())

    with tf.device('/cpu:0'):
        config = Config.get_default_config(args)
        model = Model(config)
        results, code_vector = model.predict(
            [f.to_str_with_padding() for f in funcs if f.has_pair()])

    prediction_results = common.parse_results(results, h2p_dict)

    for method_prediction in prediction_results:
        print('Original name:\t' + method_prediction.original_name)
        for name_prob_pair in method_prediction.predictions:
            print('\t(%f) predicted: %s' %
                  (name_prob_pair['probability'], name_prob_pair['name']))
        print('Attention:')
        for attention_obj in method_prediction.attention_paths:
            print('%f\tcontext: %s,%s,%s' %
                  (attention_obj['score'], attention_obj['token1'],
                   attention_obj['path'], attention_obj['token2']))

    model.close_session()
示例#7
0
    def predict(self):
        """Interactive BFS adversarial search over variable renames.

        Loop: the user edits 'Input.java', the model predicts its method
        name, then the user picks a variable and an attack type
        ('nontargeted' or a target name from the model vocabulary). A
        searcher then proposes renames, guided by gradients w.r.t. the
        input, until a rename flips the prediction (or the search gives up).
        """
        input_filename = 'Input.java'
        # MAX_ATTEMPTS = 50
        # MAX_NODES_TO_OPEN = 10

        # Vocabulary restricted to the top-K words used for adversarial moves.
        word_to_indextop, indextop_to_word = self.model.create_ordered_words_dictionary(
            self.model.get_data_dictionaries_path(self.config.LOAD_PATH),
            self.config.MAX_WORDS_FROM_VOCAB_FOR_ADVERSARIAL)

        print('Starting interactive prediction with BFS adversarial search... (depth = {}, topk = {})'
              .format(self.max_depth, self.topk))
        while True:
            print(
                'Modify the file: "%s" and press any key when ready, or "q" / "quit" / "exit" to exit' % input_filename)
            user_input = input()
            if user_input.lower() in self.exit_keywords:
                print('Exiting...')
                return



            with open(input_filename, "r") as f:
                original_code = f.read()

            try:
                predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(input_filename)
            except ValueError as e:
                print(e)
                continue

            # predict_lines[0] is "<vars> <contexts>"; keep only the contexts.
            # NOTE(review): this overwrites the file contents read above.
            var_code_split_index = predict_lines[0].find(" ")
            original_code = predict_lines[0][var_code_split_index + 1:]

            results = self.model.predict([original_code])
            prediction_results = common.parse_results(results, hash_to_string_dict, topk=SHOW_TOP_CONTEXTS)
            for method_prediction in prediction_results:
                print('Original name:\t' + method_prediction.original_name)
                for name_prob_pair in method_prediction.predictions:
                    print('\t(%f) predicted: %s' % (name_prob_pair['probability'], name_prob_pair['name']))

            # generate pca
            self.model.creat_PCA_tokens(predict_lines[0])

            # Search for adversarial examples
            print("select variable to rename OR -- to skip search:")
            var_to_rename = input()
            if var_to_rename == "--":
                continue

            # Prompt until a valid attack type is chosen; each branch builds
            # a searcher whose candidate generator always yields the chosen
            # variable (renamed copies are derived by the searcher itself).
            while True:
                print("select attack type: 'nontargeted' for non-targeted attack")
                print("OR target method name for targeted attack (each word is seperated by |)")
                attack_type = input()

                # untargeted searcher
                if attack_type == "nontargeted":
                    print("Using non-targeted attack")
                    searcher = AdversarialSearcher(self.topk, self.max_depth, word_to_indextop, indextop_to_word, predict_lines[0],
                                                   lambda c, v: [(var_to_rename, var_to_rename)])
                    break

                else: # targeted searcher
                    if attack_type in self.model.target_word_to_index:
                        print("Using targeted attack. target:", attack_type)
                        searcher = AdversarialTargetedSearcher(self.topk, self.max_depth, word_to_indextop, indextop_to_word,
                                                              predict_lines[0], attack_type,
                                                              lambda c, v: [(var_to_rename, var_to_rename)])
                        break

                    print(attack_type, "not existed in vocab! try again")

            adversarial_results = []
            # original_prediction = '|'.join(method_prediction.predictions[0]['name'])

            # BFS over rename states: evaluate unchecked candidates, record
            # hits, then step the searcher using input gradients.
            while True:
                batch_nodes_data = [(n, c) for n, c in searcher.pop_unchecked_adversarial_code()]
                batch_data = [c for _, c in batch_nodes_data]
                results = self.model.predict(batch_data, self.guard_input)
                for (node, _), res in zip(batch_nodes_data, results):
                    one_top_words = res[1]
                    one_top_words = common.filter_impossible_names(one_top_words)
                    if not one_top_words:
                        print("code with state: " +
                                          str(node) + " cause empty predictions\n")
                        continue

                    if searcher.is_target_found(one_top_words):
                        adversarial_results.append((one_top_words[0],node))


                # Unless multiple results were requested, stop at first hit.
                if adversarial_results and not self.multiple_results:
                    break

                batch_data = [searcher.get_adversarial_code()]
                batch_word_to_derive = [searcher.get_word_to_derive()]
                loss, all_grads = self.model.calc_loss_and_gradients_wrt_input(batch_data, batch_word_to_derive,
                                                                                            indextop_to_word)
                if not searcher.next((0, "", all_grads[0])):
                    break

            # NOTE(review): this tests `results` (the last model batch), not
            # `adversarial_results` — likely a bug; an empty final batch
            # would report failure even when replacements were found. Confirm.
            if not results:
                print("FAILD! no replaces found")
            else:
                print("variable replaces:")
                print("Prediction\tnode")
                for r in adversarial_results:
                    print(r[0],"\t",r[1])
示例#8
0
    def predict(self):
        """Adversarial search by renaming one variable to similar words.

        The user picks a variable in 'Input.java'; up to MAX_ATTEMPTS times
        the method predicts the file's method name, and if the prediction
        still matches the original, renames the variable to the
        gradient-preferred candidate among words similar to its current
        name, rewriting the code and trying again.
        """
        input_filename = 'Input.java'
        MAX_ATTEMPTS = 50
        MAX_NODES_TO_OPEN = 10

        words_vocab = self.model.get_words_vocab_embed()
        # words_vocab = words_vocab / np.linalg.norm(words_vocab, axis=1).reshape((-1, 1))

        print('Starting interactive prediction with similar adversarial search...')
        while True:
            print(
                'Modify the file: "%s" and press any key when ready, or "q" / "quit" / "exit" to exit' % input_filename)
            user_input = input()
            if user_input.lower() in self.exit_keywords:
                print('Exiting...')
                return

            print("select variable to rename:")
            # The current candidate name starts as the variable's own name.
            var_to_rename = newname_of_var = input()

            name_found = False
            # Names already tried (never reused).
            closed = [var_to_rename]
            with open(input_filename, "r") as f:
                original_code = f.read()

            for i in range(MAX_ATTEMPTS):
                try:
                    predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(input_filename)
                except ValueError as e:
                    print(e)
                    continue
                results = self.model.predict(predict_lines)
                prediction_results = common.parse_results(results, hash_to_string_dict, topk=SHOW_TOP_CONTEXTS)
                for method_prediction in prediction_results:
                    print('Original name:\t' + method_prediction.original_name)
                    for name_prob_pair in method_prediction.predictions:
                        print('\t(%f) predicted: %s' % (name_prob_pair['probability'], name_prob_pair['name']))

                # "Match" here means the rename changed the model's output.
                if '|'.join(method_prediction.predictions[0]['name']) != method_prediction.original_name:
                    print("MATCH FOUND!", newname_of_var)
                    print("Tried (total:", len(closed), ") :: ", closed)
                    name_found = True
                    break

                    # print('Attention:')
                    # for attention_obj in method_prediction.attention_paths:
                    #     print('%f\tcontext: %s,%s,%s' % (
                    #     attention_obj['score'], attention_obj['token1'], attention_obj['path'], attention_obj['token2']))

                loss, all_strings, all_grads = self.model.calc_loss_and_gradients_wrt_input(predict_lines)
                # Rows of the gradient matrix belonging to the current name.
                indecies_of_var = np.argwhere(all_strings == newname_of_var.lower()).flatten()
                grads_of_var = all_grads[indecies_of_var]
                if grads_of_var.shape[0] > 0:
                    # print("current loss:",loss)
                    total_grad = np.sum(grads_of_var, axis=0)
                    # # words to increase loss
                    # top_replace_with = np.argsort(total_grad)[::-1][:5]
                    # result = [(i, total_grad[i], self.model.index_to_word[i]) for i in top_replace_with]
                    # print("words to increase loss:")
                    # print(result)
                    # words to decrease loss
                    # top_replace_with = np.argsort(total_grad)[:5]

                    similarity_to_var = self.get_similar_words(newname_of_var) #self.measureSimilarity(words_vocab,newname_of_var, "euclidean")
                    result = [(self.model.word_to_index[i], i, total_grad[self.model.word_to_index[i]]) for i in similarity_to_var]

                    # Highest gradient first.
                    result.sort(key=lambda v: (-v[2]))

                    print(result)
                    # similarity_to_var = self.measureSimilarity(words_vocab,newname_of_var, "cosine")
                    # resulte = [(i, self.model.index_to_word[i], similarity_to_var[i], total_grad[i]) for i in range(1, words_vocab.shape[0])]
                    #
                    # resulte.sort(key=lambda v: (v[2],-v[3]))

                    # select new name
                    for r in result:
                        if r[1] not in closed and r[1] != method_prediction.original_name.replace("|","")\
                                and r[2] > 0:
                            print(r)
                            newname_of_var = r[1]
                            break
                    else:
                        newname_of_var = None
                    if newname_of_var is None:
                        # No untried candidate with positive gradient left.
                        break
                    closed.append(newname_of_var)

                    print("rename", var_to_rename, "to", newname_of_var)

                    # NOTE(review): writes "input.java" but reads 'Input.java';
                    # on case-sensitive filesystems the renamed code never
                    # feeds back into the next attempt — confirm intent.
                    code = InteractivePredictor.rename_variable(original_code,var_to_rename,newname_of_var)
                    with open("input.java", "w") as f:
                        f.write(code)


            if not name_found:
                print("FAILED!")
                print("Tried (total:", len(closed),") :: ", closed)
示例#9
0
    def predict(self):
        """Mono adversarial search (debug/benchmark variant).

        As committed, each attempt builds an AdversarialSearcher, prints its
        result and a timeit benchmark, then `return`s — everything after the
        bare `return` below is unreachable leftover search logic kept from
        an earlier version.
        """
        input_filename = 'Input.java'
        MAX_ATTEMPTS = 50
        MAX_NODES_TO_OPEN = 10

        print('Starting interactive prediction with mono adversarial search...')
        while True:
            print(
                'Modify the file: "%s" and press any key when ready, or "q" / "quit" / "exit" to exit' % input_filename)
            user_input = input()
            if user_input.lower() in self.exit_keywords:
                print('Exiting...')
                return

            print("select variable to rename:")
            var_to_rename = newname_of_var = input()

            name_found = False

            opened = []
            closed = []
            with open(input_filename, "r") as f:
                original_code = f.read()

            for i in range(MAX_ATTEMPTS):
                try:
                    predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(input_filename)
                except ValueError as e:
                    print(e)
                    continue

                # Benchmark the searcher 1000 times, then bail out.
                bfs = AdversarialSearcher(2, 2, self.model)
                r = bfs.find_adversarial(predict_lines)
                print(r)
                print(timeit.timeit(lambda: bfs.find_adversarial(predict_lines), number=1000))


                # NOTE(review): everything below this `return` is dead code.
                return
                results = self.model.predict(predict_lines)
                prediction_results = common.parse_results(results, hash_to_string_dict, topk=SHOW_TOP_CONTEXTS)
                for method_prediction in prediction_results:
                    print('Original name:\t' + method_prediction.original_name)
                    for name_prob_pair in method_prediction.predictions:
                        print('\t(%f) predicted: %s' % (name_prob_pair['probability'], name_prob_pair['name']))

                if '|'.join(method_prediction.predictions[0]['name']) == method_prediction.original_name:
                    print("MATCH FOUND!", newname_of_var)
                    print("Tried (total:", len(closed), ") :: ", closed)
                    name_found = True
                    break

                    # print('Attention:')
                    # for attention_obj in method_prediction.attention_paths:
                    #     print('%f\tcontext: %s,%s,%s' % (
                    #     attention_obj['score'], attention_obj['token1'], attention_obj['path'], attention_obj['token2']))

                loss, all_strings, all_grads = self.model.calc_loss_and_gradients_wrt_input(predict_lines)
                indecies_of_var = np.argwhere(all_strings == newname_of_var.lower()).flatten()
                grads_of_var = all_grads[indecies_of_var]
                if grads_of_var.shape[0] > 0:
                    # print("current loss:",loss)
                    total_grad = np.sum(grads_of_var, axis=0)
                    # # words to increase loss
                    # top_replace_with = np.argsort(total_grad)[::-1][:5]
                    # result = [(i, total_grad[i], self.model.index_to_word[i]) for i in top_replace_with]
                    # print("words to increase loss:")
                    # print(result)
                    # words to decrease loss
                    top_replace_with = np.argsort(total_grad)[:5]
                    result = [(i, total_grad[i], self.model.index_to_word[i]) for i in top_replace_with]

                    # select new name
                    for r in result:
                        if r[2] not in closed and r[2] != method_prediction.original_name.replace("|",""):
                            print(r)
                            newname_of_var = r[2]
                            break
                    else:
                        newname_of_var = None
                    if newname_of_var is None:
                        break
                    closed.append(newname_of_var)

                    print("rename", var_to_rename, "to", newname_of_var)

                    code = InteractivePredictor.rename_variable(original_code,var_to_rename,newname_of_var)
                    with open("input.java", "w") as f:
                        f.write(code)


            if not name_found:
                print("FAILED!")
                print("Tried (total:", len(closed),") :: ", closed)
    def predict(self):
        """Batch adversarial sweep over a folder of Java samples.

        For each file in test_adversarial/src whose method name the model
        already predicts correctly, tries every target in `targets` against
        every variable in the method, running a gradient-guided (targeted
        or untargeted) rename search and printing successful replacements.
        """
        input_filename = 'Input.java'
        # MAX_ATTEMPTS = 50
        # MAX_NODES_TO_OPEN = 10

        src_folder = "test_adversarial/src"
        # input_src = ["contains.java", "count.java", "done.java", "escape.java", "factorial.java", "get.java",
        #              "indexOf.java", "isPrime.java", "postRequest.java", "reverseArray.java", "sort.java"]
        input_src = os.listdir(src_folder)
        # Target method names to force; words separated by '|'.
        targets = [
            "sort", "contains", "get", "index|of", "done", "reverse|array",
            "count", "is|prime", "post|request", "escape", "add", "close",
            "main", "max", "min", "factorial", "load", "foo", "update", "bar",
            "exception", "test", "swap", "predict"
        ]

        # Vocabulary restricted to the top-K words used for adversarial moves.
        word_to_indextop, indextop_to_word = self.model.create_ordered_words_dictionary(
            self.model.get_data_dictionaries_path(self.config.LOAD_PATH),
            self.config.MAX_WORDS_FROM_VOCAB_FOR_ADVERSARIAL)

        print(
            'Starting interactive prediction with BFS adversarial search... (depth = {}, topk = {})'
            .format(self.max_depth, self.topk))
        for src in input_src:
            print('SAMPLE: ', src)

            input_filename = src_folder + "/" + src

            try:
                predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(
                    input_filename)
            except ValueError as e:
                print(e)
                continue

            # Split the extracted line into its variable list and contexts.
            var, original_code = common_adversarial.separate_vars_code(
                predict_lines[0])

            # ignore methods without vars
            if not common_adversarial.get_all_vars(var):
                print("NO VARS. skip.")
                continue

            results = self.model.predict([original_code])
            prediction_results = common.parse_results(results,
                                                      hash_to_string_dict,
                                                      topk=SHOW_TOP_CONTEXTS)
            # skip method that were predicted wrong
            method_prediction = prediction_results[0]
            if method_prediction.original_name.lower() != "".join(
                    method_prediction.predictions[0]['name']):
                print("WRONG PREDICTION. skip. (true: {}, pred: {})".format(
                    method_prediction.original_name,
                    method_prediction.predictions))
                continue
            for method_prediction in prediction_results:
                print('Original name:\t' + method_prediction.original_name)
                for name_prob_pair in method_prediction.predictions:
                    print('\t(%f) predicted: %s' %
                          (name_prob_pair['probability'],
                           name_prob_pair['name']))

            # Search for adversarial examples
            print("ADVERSARIAL results:")

            for target in targets:
                print("TARGET:", target)
                if target != "nontargeted" and target not in self.model.target_word_to_index:
                    print("target not exist. skip.")
                    continue

                for var_to_rename in common_adversarial.get_all_vars(var):
                    # untargeted searcher
                    if target == "nontargeted":
                        searcher = AdversarialSearcher(
                            self.topk, self.max_depth, word_to_indextop,
                            indextop_to_word, predict_lines[0],
                            lambda c, v: [(var_to_rename, var_to_rename)])
                    else:  # targeted searcher
                        searcher = AdversarialTargetedSearcher(
                            self.topk, self.max_depth, word_to_indextop,
                            indextop_to_word, predict_lines[0], target,
                            lambda c, v: [(var_to_rename, var_to_rename)])

                    adversarial_results = []

                    # BFS over rename states: evaluate unchecked candidates,
                    # record hits, then step using input gradients.
                    while True:
                        batch_nodes_data = [
                            (n, c) for n, c in
                            searcher.pop_unchecked_adversarial_code()
                        ]
                        batch_data = [c for _, c in batch_nodes_data]
                        results = self.model.predict(batch_data,
                                                     self.guard_input)
                        for (node, _), res in zip(batch_nodes_data, results):
                            one_top_words = res[1]
                            one_top_words = common.filter_impossible_names(
                                one_top_words)
                            if not one_top_words:
                                print("code with state: " + str(node) +
                                      " cause empty predictions\n")
                                continue

                            if searcher.is_target_found(one_top_words):
                                adversarial_results.append(
                                    (one_top_words[0], node))

                        # Unless multiple results requested, stop at first hit.
                        if adversarial_results and not self.multiple_results:
                            break

                        batch_data = [searcher.get_adversarial_code()]
                        batch_word_to_derive = [searcher.get_word_to_derive()]
                        loss, all_grads = self.model.calc_loss_and_gradients_wrt_input(
                            batch_data, batch_word_to_derive, indextop_to_word)
                        if not searcher.next((0, "", all_grads[0])):
                            break

                    for r in adversarial_results:
                        print(r[0], "\t\t\t", r[1])
    def create_file_vectors(self):
        """Build per-file code-vector records for every class folder in the dataset.

        Walks dataset_dir/self.data_dir (one subfolder per class value),
        samples at most self.k files per class, splits each file into
        methods, predicts a code vector per method, and returns a list of
        dicts: {'methods': [{'vector', 'length'}], 'class_val', 'filename',
        'processed': False}.

        Relies on module-level globals: dataset_dir, tmp_file_name, debug.
        """
        folder_dir = os.path.join(dataset_dir, self.data_dir)
        file_vectors = []
        fileNum = 0
        # Loop through each class value
        for class_val in os.listdir(folder_dir):
            # Get each file from each class
            class_folder = os.path.join(folder_dir, class_val)
            if os.path.isdir(class_folder):
                file_list = os.listdir(class_folder)

                # Limit the number of files per class
                if len(file_list) > self.k:
                    print(
                        "File list over the limit, randomly selecting {} files..."
                        .format(self.k))
                    # Fixed seed keeps the per-class sample reproducible.
                    random.seed(42)
                    file_list = random.sample(file_list, self.k)

                for file in file_list:
                    time0 = time.time()
                    method_vectors = []

                    # Split the file into its composing methods
                    methods = self.class_preprocessor.get_methods(
                        os.path.join(class_folder, file))

                    # for each of it's composing methods
                    for method in methods:
                        # Get number of lines in the method
                        lines = method.count('\n')

                        # Spit it into a temp file
                        try:
                            with open(tmp_file_name, mode='w') as tmp_file:
                                tmp_file.write(method)
                        except Exception as e:
                            # NOTE(review): write failures are only logged in
                            # debug mode; the loop then feeds the previous
                            # temp-file contents to the extractor.
                            if debug:
                                print("{}\n{}".format(e, method))

                        # Make the predictions
                        try:
                            predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(
                                tmp_file_name)
                        except ValueError as e:
                            print("=====================")
                            if debug:
                                print(
                                    "Error for method {} in file {}. Note this may simply be caused by the method being a constructor"
                                    .format(method, file))
                            print("\nException message:\n")
                            print(e)
                            print("=====================")
                            continue

                        results, code_vectors = self.model.predict(
                            predict_lines)
                        prediction_results = common.parse_results(
                            results,
                            hash_to_string_dict,
                            topk=SHOW_TOP_CONTEXTS)

                        # Process the predictions
                        for i, method_prediction in enumerate(
                                prediction_results):
                            method_vectors.append({
                                "vector": code_vectors[i],
                                "length": lines
                            })

                    file_vectors.append({
                        'methods': method_vectors,
                        'class_val': class_val,
                        'filename': file,
                        'processed': False
                    })

                    print("#{}\t{}\tTime: {}s".format(
                        fileNum, file, round(time.time() - time0, 3)))
                    fileNum += 1

        os.remove(tmp_file_name)
        return file_vectors