def feats_pca(cnn, vid_ids_path, num_vids, org_dim, new_dim):
    feat_save_path = config.MSVD_FEATS_DIR + cnn + "_pca" + str(new_dim) + "/"
    print("saving feats to: " + feat_save_path)
    utils.create_dir_if_not_exist(feat_save_path)
    vid_ids = utils.read_file_to_list(vid_ids_path)
    vid_clips_list = [vid[:-4] for vid in vid_ids]  # strip the file extension
    assert len(vid_ids) == num_vids
    vid_feats_all = np.empty((0, org_dim), dtype=np.float32)
    for vid in vid_clips_list:
        # print("loading features from : "+vid)
        vid_feats_path = config.MSVD_FEATS_DIR + cnn + "/" + vid + ".npy"
        vid_feats = np.load(vid_feats_path)
        # average the per-frame features into a single clip-level vector
        vid_feat_avg = np.mean(vid_feats, axis=0)
        vid_feats_all = np.vstack((vid_feats_all, vid_feat_avg))
    print(vid_feats_all.shape)
    # vid_feats_scaled = StandardScaler().fit_transform(vid_feats_all)
    vid_feats_pca = PCA(n_components=new_dim).fit_transform(vid_feats_all)
    print(vid_feats_pca.shape)
    for ind in range(num_vids):
        vid = vid_clips_list[ind]
        vid_feat = vid_feats_pca[ind]
        np.save(feat_save_path + vid + ".npy", vid_feat)
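These MSVD feature scripts lean on a small `utils` module that is not shown. A minimal sketch of the two helpers used above, under the assumption that `read_file_to_list` returns the file's lines as a list and `create_dir_if_not_exist` wraps `os.makedirs`; the project's actual implementations may differ:

import os

def read_file_to_list(file_path):
    # return the file's lines without trailing newlines
    with open(file_path) as f:
        return [line.rstrip('\n') for line in f]

def create_dir_if_not_exist(dir_path):
    # create the directory (and any parents) only if it is missing
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)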
def main(): parser = argparse.ArgumentParser(prog="mipsal", description='Assemble and link a MIPS assembly program.') parser.add_argument("files", action="store", nargs="+", type=str, help="list of assembly files to process") parser.add_argument("--int", action="store_true", default=False, help="output intermediate files") parser.add_argument("--obj", action="store_true", default=False, help="output object files") parser.add_argument("-o", action="store", dest="out_name", type=str, default="mips.out", help="override output file name", metavar="file_name") parser.add_argument("-l", "--link", action="append", help="add file to program when linking. This option can be used more than once", metavar="file_name") args = parser.parse_args() obj_code = [] for input_file in args.files: ints, objs = assembler.assemble(input_file) obj_code.append(objs) file_name = utils.get_file_name(input_file) if args.int: int_file = file_name + ".int" utils.write_file_from_list(int_file, ints) if args.obj: obj_file = file_name + ".o" utils.write_file_from_list(obj_file, objs) if args.link != None: for link_file in args.link: obj_code.append([x.strip() for x in utils.read_file_to_list(link_file)]) output = linker.link(obj_code) utils.write_file_from_list(args.out_name, output)
def __fetch_sensitive_apis(self):
    """
    :return: 0 success 1 failure
    """
    self.sensitive_apis = []
    output_apis_txt_path = os.path.join(self.dst_output_path, 'apis.txt')
    if not os.path.exists(output_apis_txt_path):
        smali_search_result = glob.glob(
            os.path.join(self.smali_dir_path, "**\\*.smali"), recursive=True)
        for smali_file in smali_search_result:
            if extract_sensitive_apis_list_from_smali(
                    self.sensitive_apis, smali_file) != STATUS_OK:
                print('extract apis failed')
                return STATUS_ERR
        write_list_to_file(self.sensitive_apis, output_apis_txt_path)
    if (not self.sensitive_apis) and read_file_to_list(
            self.sensitive_apis, output_apis_txt_path) != STATUS_OK:
        return STATUS_ERR
    else:
        get_filtered_vector(
            self.feature_list, self.sensitive_apis,
            CONSTANTS['SENSITIVE_APIS_106']['REFERENCE_LIST'])
        return STATUS_OK
def __fetch_intent_actions(self):
    """
    :return: 0 success 1 failure
    """
    self.intent_actions = []
    output_actions_txt_path = os.path.join(self.dst_output_path,
                                           'actions.txt')
    if not os.path.exists(output_actions_txt_path):
        if extract_spec_list_from_file(
                self.intent_actions, self.am_processed_path,
                EXTRACT_SPECS['ACTION']) != STATUS_OK:
            return STATUS_ERR
        write_list_to_file(self.intent_actions, output_actions_txt_path)
    if (not self.intent_actions) and read_file_to_list(
            self.intent_actions, output_actions_txt_path) != STATUS_OK:
        return STATUS_ERR
    else:
        if self.include_intent_actions_126:
            get_filtered_vector(
                self.feature_list, self.intent_actions,
                CONSTANTS['INTENT_ACTIONS_126']['REFERENCE_LIST'])
            return STATUS_OK
        elif self.include_intent_actions_110:
            get_filtered_vector(
                self.feature_list, self.intent_actions,
                CONSTANTS['INTENT_ACTIONS_110']['REFERENCE_LIST'])
            return STATUS_OK
def _init():
    read_file_to_list(CONSTANTS['PERMISSIONS_147']['REFERENCE_LIST'],
                      CONSTANTS['PERMISSIONS_147']['REFERENCE_FILE'])
    read_file_to_list(CONSTANTS['INTENT_ACTIONS_126']['REFERENCE_LIST'],
                      CONSTANTS['INTENT_ACTIONS_126']['REFERENCE_FILE'])
    read_file_to_list(CONSTANTS['INTENT_ACTIONS_110']['REFERENCE_LIST'],
                      CONSTANTS['INTENT_ACTIONS_110']['REFERENCE_FILE'])
    read_file_to_list(CONSTANTS['SENSITIVE_APIS_106']['REFERENCE_LIST'],
                      CONSTANTS['SENSITIVE_APIS_106']['REFERENCE_FILE'])
    read_file_to_list(CONSTANTS['ANDROID_PACKAGES']['REFERENCE_LIST'],
                      CONSTANTS['ANDROID_PACKAGES']['REFERENCE_FILE'])
    # for api_level, api_dict in CONSTANTS['PERMISSION_MAPPINGS'].items():
    #     read_permission_map_file_to_dict(
    #         api_dict['REFERENCE_DICT'], api_dict['REFERENCE_FILE'])
    read_permission_map_file_to_dict(
        CONSTANTS['PERMISSION_MAPPINGS']['16']['REFERENCE_DICT'],
        CONSTANTS['PERMISSION_MAPPINGS']['16']['REFERENCE_FILE'])
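Note that these malware-analysis snippets use a different `read_file_to_list` convention from the `utils` version sketched earlier: the destination list is passed in, the file's lines are appended to it in place, and a status code is returned. A plausible sketch of that variant, assuming `STATUS_OK`/`STATUS_ERR` are 0/1 as the docstrings state:

STATUS_OK = 0
STATUS_ERR = 1

def read_file_to_list(dst_list, file_path):
    # fill dst_list in place with the file's lines; report success or failure
    try:
        with open(file_path) as f:
            dst_list.extend(line.rstrip('\n') for line in f)
        return STATUS_OK
    except OSError:
        return STATUS_ERR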
def main(): parser = argparse.ArgumentParser(prog="mipsl", description='Link a MIPS program from multiple object files.') parser.add_argument("files", action="store", nargs="+", type=str, help="list of object files to process") parser.add_argument("-o", action="store", dest="out_name", type=str, default="mips.out", help="override output file name", metavar="file_name") args = parser.parse_args() obj_code = [] for link_file in args.files: obj_code.append([x.strip() for x in utils.read_file_to_list(link_file)]) output = link(obj_code) utils.write_file_from_list(args.out_name, output)
def __fetch_package_call_graph(self):
    """ Extract by package name """
    # Read the adjacency matrix and the list of functions
    all_funcs = []
    read_file_to_list(all_funcs,
                      os.path.join(self.dst_output_path, 'all_funcs.txt'))
    adj = sp.csr_matrix(
        sp.load_npz(os.path.join(self.dst_output_path, 'adj_matrix.npz')))
    features = sp.csr_matrix(
        sp.load_npz(os.path.join(self.dst_output_path, 'node_features.npz')))
    # 65 x 65 package-level adjacency
    pkg_adj_matrix = np.zeros((65, 65), dtype=np.uint32)
    pkg_adj_matrix[0, :] = np.ones((1, 65), dtype=np.uint32)
    # 65 x (147 + 126) package-level features
    pkg_node_features = np.zeros((65, 273), dtype=np.uint32)
    pkg_node_features[0, :] = np.array(
        np.load(
            os.path.join(
                self.dst_output_path, 'features_' +
                str(self.requested_features - 32) + '.npy'))[0:273],
        dtype=np.uint32)
    for i in range(len(all_funcs)):
        caller_idx = self.__get_pkg_idx(all_funcs[i])
        if caller_idx == -1:
            continue
        pkg_node_features[caller_idx, :] = pkg_node_features[
            caller_idx, :] + features[i, :].todense()
        called_idxs = adj[i].todense().tolist()[0]
        for j in range(len(called_idxs)):
            if called_idxs[j] == 1:
                # note: __get_pkg_idx may return -1 here, which would
                # silently write into the last column of the matrix
                called_idx = self.__get_pkg_idx(all_funcs[j])
                pkg_adj_matrix[caller_idx, called_idx] = 1
    np.save(os.path.join(self.dst_output_path, 'pkg_adj_matrix.npy'),
            pkg_adj_matrix)
    np.save(os.path.join(self.dst_output_path, 'pkg_node_features.npy'),
            pkg_node_features)
def __fetch_pkg_features(self):
    all_funcs = []
    read_file_to_list(all_funcs,
                      os.path.join(self.dst_output_path, 'all_funcs.txt'))
    node_feat_path = os.path.join(self.dst_output_path, 'node_features.npz')
    pkg_features = np.zeros((65, 273), dtype=np.uint8)
    node_features = sp.csr_matrix(sp.load_npz(node_feat_path))
    num = node_features.shape[0]
    for i in range(1, len(all_funcs)):  # skip index 0, the MainNode
        if i >= num:
            break
        idx = self.__get_pkg_idx(all_funcs[i])
        if idx == -1:
            pkg_features[0, :] = pkg_features[0, :] + node_features[i, :].todense()
        else:
            pkg_features[idx, :] = pkg_features[idx, :] + node_features[i, :].todense()
    np.save(os.path.join(self.dst_output_path, 'pkg_features.npy'),
            pkg_features)
def main(): parser = argparse.ArgumentParser( prog="mipsal", description='Assemble and link a MIPS assembly program.') parser.add_argument("files", action="store", nargs="+", type=str, help="list of assembly files to process") parser.add_argument("--int", action="store_true", default=False, help="output intermediate files") parser.add_argument("--obj", action="store_true", default=False, help="output object files") parser.add_argument("-o", action="store", dest="out_name", type=str, default="mips.out", help="override output file name", metavar="file_name") parser.add_argument( "-l", "--link", action="append", help= "add file to program when linking. This option can be used more than once", metavar="file_name") args = parser.parse_args() obj_code = [] for input_file in args.files: ints, objs = assembler.assemble(input_file) obj_code.append(objs) file_name = utils.get_file_name(input_file) if args.int: int_file = file_name + ".int" utils.write_file_from_list(int_file, ints) if args.obj: obj_file = file_name + ".o" utils.write_file_from_list(obj_file, objs) if args.link != None: for link_file in args.link: obj_code.append( [x.strip() for x in utils.read_file_to_list(link_file)]) output = linker.link(obj_code) utils.write_file_from_list(args.out_name, output)
def load_data(self):
    print('loading {}-{} features'.format(self.dataset_name, self.cnn_name))
    self.train_data_ids = utils.read_file_to_list(self.train_data_ids_path)
    self.val_data_ids = utils.read_file_to_list(self.val_data_ids_path)
    self.test_data_ids = utils.read_file_to_list(self.test_data_ids_path)
    utils.shuffle_array(self.train_data_ids)
    utils.shuffle_array(self.val_data_ids)
    utils.shuffle_array(self.test_data_ids)
    self.train_data_ids = self.train_data_ids[:1]  # ONLY FOR DEBUG - REMOVE
    self.val_data_ids = self.val_data_ids[:1]
    self.test_data_ids = self.test_data_ids[:1]
    self.train_caps = utils.read_from_json(self.train_caps_path)
    self.val_caps = utils.read_from_json(self.val_caps_path)
    self.test_caps = utils.read_from_json(self.test_caps_path)
    self.vocab = utils.read_from_json(self.vocab_path)
    self.reverse_vocab = utils.read_from_pickle(self.reverse_vocab_path)
    self.vocab_size = len(self.vocab)
    if self.cnn_name in ['ResNet50', 'ResNet152', 'InceptionV3']:
        self.ctx_dim = 2048
    elif self.cnn_name in ['MURALI']:
        self.ctx_dim = 1024
    elif self.cnn_name in ['VGG19']:
        self.ctx_dim = 512
    else:
        raise NotImplementedError()
    self.train_ids = self.get_vid_ids(self.train_data_ids)
    self.val_ids = self.get_vid_ids(self.val_data_ids)
    self.test_ids = self.get_vid_ids(self.test_data_ids)
    self.kf_train = utils.generate_minibatch_idx(len(self.train_data_ids),
                                                 self.mb_size_train)
    self.kf_val = utils.generate_minibatch_idx(
        len(self.val_data_ids), self.mb_size_test)  # TODO - verify test or val
    self.kf_test = utils.generate_minibatch_idx(len(self.test_data_ids),
                                                self.mb_size_test)
def __init__(self, root, transform=None, target_transform=None):
    classes, class_to_idx = self._find_classes(root)
    samples = _make_dataset(root, class_to_idx, IMG_EXTENSIONS)

    cooccurring_tracks_file = os.path.join(root, "cooccurring_tracks.txt")
    with open(cooccurring_tracks_file) as file:
        self.cooccurring_tracks = [[int(n) for n in line.split(',')]
                                   for line in file]

    if len(samples) == 0:
        raise RuntimeError(
            "Found 0 files in subfolders of: " + root + "\n"
            "Supported extensions are: " + ",".join(IMG_EXTENSIONS))

    self.root = root
    self.classes = classes
    self.class_to_idx = class_to_idx
    self.samples = samples
    self.track_targets = [s[1] for s in samples]
    # self.gt_targets = [s[2] for s in samples]

    track_to_gt_list = utils.read_file_to_list(
        os.path.join(root, 'track_gt.txt'))
    track_to_gt_dict = utils.list_to_dict(track_to_gt_list)

    gtclass_to_idx = {}
    gt_idx = 0
    gt_targets = []
    for track_id in self.track_targets:
        if track_to_gt_dict[track_id] not in gtclass_to_idx.keys():
            gtclass_to_idx[track_to_gt_dict[track_id]] = gt_idx
            gt_idx += 1
        label = gtclass_to_idx[track_to_gt_dict[track_id]]
        gt_targets.append(label)
    self.gt_targets = gt_targets

    # compare against an ndarray: `list == scalar` would always be False
    track_targets_arr = np.asarray(self.track_targets)
    track_idx_to_sample_idx = {}
    for track_idx in np.unique(track_targets_arr):
        track_idx_to_sample_idx[track_idx] = np.where(
            track_targets_arr == track_idx)[0]
    self.track_idx_to_sample_idx = track_idx_to_sample_idx

    self.transform = transform
    self.target_transform = target_transform
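`utils.list_to_dict` is not shown. Given that each line of `track_gt.txt` pairs a track id with a ground-truth identity and the dict is keyed by track id above, a plausible sketch (the whitespace separator is an assumption):

def list_to_dict(pair_lines):
    # hypothetical: each line is "<track_id> <gt_id>"; key by integer track id
    mapping = {}
    for line in pair_lines:
        track_id, gt_id = line.split()[:2]
        mapping[int(track_id)] = gt_id
    return mapping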
def split_data(csv_data):
    vid_ids = utils.read_file_to_list(config.MSVD_VID_IDS_ALL_PATH)
    assert len(vid_ids) == config.TOTAL_VIDS
    utils.shuffle_array(vid_ids)
    # standard MSVD split: 1200 train / 100 val / 670 test clips
    train_ids = vid_ids[0:1200]
    val_ids = vid_ids[1200:1300]
    test_ids = vid_ids[1300:1970]
    assert len(train_ids) == config.TRAIN_VIDS
    assert len(val_ids) == config.VAL_VIDS
    assert len(test_ids) == config.TEST_VIDS
    utils.write_list_to_file(config.MSVD_VID_IDS_TRAIN_PATH, train_ids)
    utils.write_list_to_file(config.MSVD_VID_IDS_VAL_PATH, val_ids)
    utils.write_list_to_file(config.MSVD_VID_IDS_TEST_PATH, test_ids)
    train_df = filter_df(csv_data, train_ids, config.MSVD_FINAL_CORPUS_TRAIN_PATH)
    val_df = filter_df(csv_data, val_ids, config.MSVD_FINAL_CORPUS_VAL_PATH)
    test_df = filter_df(csv_data, test_ids, config.MSVD_FINAL_CORPUS_TEST_PATH)
    return train_df, val_df, test_df
def frames_to_feat(cnn, vid_ids_path, num_vids):
    if cnn == "ResNet50":
        model, height, width, preprocess_input = get_ResNet50_model()
        FEAT_DIM = config.RESNET_FEAT_DIM
    elif cnn == "ResNet152":
        model, height, width, preprocess_input = get_ResNet152_model()
        FEAT_DIM = config.RESNET_FEAT_DIM
    elif cnn == "InceptionV3":
        model, height, width, preprocess_input = get_InceptionV3_model()
        FEAT_DIM = config.INCEPTION_FEAT_DIM
    elif cnn == "VGG19":
        model, height, width, preprocess_input = get_VGG19_model()
        FEAT_DIM = config.VGG_FEAT_DIM
    else:
        raise NotImplementedError()
    feat_save_path = config.MSVD_FEATS_DIR + cnn + "/"
    print("saving feats to: " + feat_save_path)
    utils.create_dir_if_not_exist(feat_save_path)
    vid_ids = utils.read_file_to_list(vid_ids_path)
    vid_clips_list = [vid[:-4] for vid in vid_ids]
    assert len(vid_ids) == num_vids
    for vid in vid_clips_list:
        print("extracting features from : " + vid)
        vid_frames_dir = config.MSVD_FRAMES_DIR + "/" + vid
        frames_list = utils.read_dir(vid_frames_dir)
        n_frames = len(frames_list)
        if n_frames > config.MAX_FRAMES:
            n_frames = config.MAX_FRAMES
        selected_frames = extract_frames_equally_spaced(n_frames,
                                                        config.FRAME_SPACING)
        vid_feats = np.empty((0, FEAT_DIM), dtype=np.float32)
        for fid in selected_frames:
            img_path = vid_frames_dir + "/frame" + str(fid) + ".jpg"
            # print("extracting features from : "+img_path)
            img_feat = img_to_feat(img_path, height, width, preprocess_input,
                                   model)
            vid_feats = np.vstack((vid_feats, img_feat))
        print(vid_feats.shape)
        np.save(feat_save_path + vid + ".npy", vid_feats)
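`extract_frames_equally_spaced` is only visible through its call site here, so the following is an assumption rather than the project's code: a sketch that picks up to `how_many` evenly spaced frame indices out of `n_frames`:

import numpy as np

def extract_frames_equally_spaced(n_frames, how_many):
    # hypothetical: spread `how_many` indices evenly across [0, n_frames);
    # with fewer frames than requested, keep them all
    if n_frames <= how_many:
        return list(range(n_frames))
    return np.linspace(0, n_frames - 1, how_many).astype(int).tolist()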
def feats_kmeans(cnn, vid_ids_path, num_vids, org_dim, k):
    feat_save_path = config.MSVD_FEATS_DIR + cnn + "_kmeans" + str(k) + "/"
    print("saving feats to: " + feat_save_path)
    utils.create_dir_if_not_exist(feat_save_path)
    vid_ids = utils.read_file_to_list(vid_ids_path)
    vid_clips_list = [vid[:-4] for vid in vid_ids]
    assert len(vid_ids) == num_vids
    for vid in vid_clips_list:
        # print("loading features from : "+vid)
        vid_feats_path = config.MSVD_FEATS_DIR + cnn + "/" + vid + ".npy"
        vid_feats = np.load(vid_feats_path)
        # summarise each clip by k cluster centres instead of a single mean
        kmeans = KMeans(n_clusters=k, init='k-means++',
                        random_state=0).fit(vid_feats)
        vid_feat_kmeans = kmeans.cluster_centers_
        np.save(feat_save_path + vid + ".npy", vid_feat_kmeans)
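Unlike `feats_pca`, which mean-pools a clip's frame features before projecting, `feats_kmeans` keeps `k` cluster centres per clip, so each saved array has shape `(k, org_dim)`. A self-contained check of that shape with scikit-learn; the feature values here are random stand-ins:

import numpy as np
from sklearn.cluster import KMeans

frame_feats = np.random.rand(28, 2048).astype(np.float32)  # e.g. 28 frames of ResNet features
kmeans = KMeans(n_clusters=5, init='k-means++', random_state=0).fit(frame_feats)
print(kmeans.cluster_centers_.shape)  # (5, 2048)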
def __fetch_permissions(self):
    """
    :return: 0 success 1 failure
    """
    self.permissions = []
    output_permissions_txt_path = os.path.join(self.dst_output_path,
                                               'permissions.txt')
    if not os.path.exists(output_permissions_txt_path):
        if extract_spec_list_from_file(
                self.permissions, self.am_processed_path,
                EXTRACT_SPECS['PERMISSION']) != STATUS_OK:
            return STATUS_ERR
        write_list_to_file(self.permissions, output_permissions_txt_path)
    if (not self.permissions) and read_file_to_list(
            self.permissions, output_permissions_txt_path) != STATUS_OK:
        return STATUS_ERR
    else:
        get_filtered_vector(self.feature_list, self.permissions,
                            CONSTANTS['PERMISSIONS_147']['REFERENCE_LIST'])
        return STATUS_OK
def assemble(input_file):
    cleaned = [
        strip_comments(line).strip()
        for line in utils.read_file_to_list(input_file)
    ]
    asm = [line for line in cleaned if line != ""]
    symtbl = SymbolTable(False)
    reltbl = SymbolTable(True)

    # Pass One
    intermediate, errors_one = pass_one(asm, symtbl)
    # Pass Two
    output, errors_two = pass_two(intermediate, symtbl, reltbl)

    if len(errors_one) > 0:
        print("Errors during pass one:")
        for line_num, e in errors_one:
            print("Error: line {0}: {1}".format(line_num, e))
    if len(errors_two) > 0:
        print("Errors during pass two:")
        for line_num, e in errors_two:
            print("Error: line {0}: {1}".format(line_num, e))
    if len(errors_one) > 0 or len(errors_two) > 0:
        print("One or more errors encountered during assembly operation")
    return intermediate, output
def main(): parser = argparse.ArgumentParser( prog="mipsl", description='Link a MIPS program from multiple object files.') parser.add_argument("files", action="store", nargs="+", type=str, help="list of object files to process") parser.add_argument("-o", action="store", dest="out_name", type=str, default="mips.out", help="override output file name", metavar="file_name") args = parser.parse_args() obj_code = [] for link_file in args.files: obj_code.append( [x.strip() for x in utils.read_file_to_list(link_file)]) output = link(obj_code) utils.write_file_from_list(args.out_name, output)
from functools import reduce
import unittest

# fuel_from_mass, sum_list_of_mass_modules and read_file_to_list are
# defined earlier in this file or imported from a helper module

def total_fuel_from_mass(mass):
    fuel = fuel_from_mass(mass)
    if fuel <= 0:
        return 0
    else:
        return fuel + total_fuel_from_mass(fuel)

def sum_updated_list_of_mass_modules(mass_list):
    return reduce(lambda total, current: total + total_fuel_from_mass(current),
                  mass_list, 0)

if __name__ == '__main__':
    mass_modules = read_file_to_list("input/01.txt")
    print(sum_list_of_mass_modules(mass_modules))
    print(sum_updated_list_of_mass_modules(mass_modules))

class Test(unittest.TestCase):
    def test_setup_properly(self):
        self.assertEqual(2, 1 + 1)

    def test_mass_of_twelve_equals_fuel_of_two(self):
        fuel = fuel_from_mass(12)
        self.assertEqual(fuel, 2)

    def test_mass_of_nineteen_sixty_nine_equals_fuel_of_six_fifty_four(self):
        fuel = fuel_from_mass(1969)
        self.assertEqual(fuel, 654)  # expected value per the test name
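`fuel_from_mass` is defined elsewhere, but the tests above pin it down (12 gives 2 and, per the last test's name, 1969 gives 654), which matches fuel = floor(mass / 3) - 2. A sketch consistent with those expectations, with `sum_list_of_mass_modules` reconstructed by analogy with `sum_updated_list_of_mass_modules`:

from functools import reduce

def fuel_from_mass(mass):
    # floor-divide by three and subtract two: 12 -> 2, 1969 -> 654
    return mass // 3 - 2

def sum_list_of_mass_modules(mass_list):
    # assumes mass_list already holds integers
    return reduce(lambda total, current: total + fuel_from_mass(current),
                  mass_list, 0)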
import utils

utils.getVersion()

input = utils.read_file_to_list("input.txt")
# print(input)
size = len(input)
utils.log.debug("len: {}".format(size))

seats = [i for i in range(127 * 8 + 8)]  # all possible seat ids: 0 .. 1023
max_seat_id = 0
for i in range(0, size):
    seat_id = utils.get_seat_id(input[i])
    seats[seat_id] = 'X'
    if seat_id > max_seat_id:
        max_seat_id = seat_id
utils.log.info("solution part 1 ==> max_seat_id: {}".format(max_seat_id))  # ==> 928

possible_seats = 0
my_seat = -1
for i in range(0, len(seats)):
    if 1 < i < len(seats) - 1:  # keep i + 1 from running off the end
        if seats[i - 1] == 'X' and seats[i + 1] == 'X' and seats[i] != 'X':
            utils.log.debug("empty seat: {}".format(seats[i]))
            my_seat = seats[i]
            possible_seats += 1
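`utils.get_seat_id` is not shown. Boarding passes of this kind are binary space partitions (F/B halve 128 rows, L/R halve 8 columns), so a plausible implementation reads the ten characters as one binary number; this is an assumption about the helper, not its actual code:

def get_seat_id(boarding_pass):
    # F and L select the lower half (bit 0), B and R the upper half (bit 1),
    # e.g. "FBFBBFFRLR" -> 0101100101 -> 357
    bits = boarding_pass.strip().translate(str.maketrans("FBLR", "0101"))
    return int(bits, 2)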
# (excerpt begins midway through process_intcode; the loop header and the
#  "ADDITION" branch that set augend/addend/target are not shown)
        intcode[target] = intcode[augend] + intcode[addend]
    elif opcode == "MULTIPLICATION":
        multiplier = intcode[pointer + 1]
        multiplicand = intcode[pointer + 2]
        target = intcode[pointer + 3]
        intcode[target] = intcode[multiplier] * intcode[multiplicand]
    else:
        raise ValueError("opcode should be 1, 2 or 99")
    pointer += 4
    return intcode

if __name__ == "__main__":
    intcode = [int(x) for x in read_file_to_list("input/02.txt")[0].split(",")]
    # replace two positions with hardcoded data (via instructions)
    intcode[1] = 12
    intcode[2] = 2
    print(process_intcode(intcode))
    print(f"the value at position 0 after the program halts is: {intcode[0]}")

    intcode = [int(x) for x in read_file_to_list("input/02.txt")[0].split(",")]
    output_19690720 = find_output_19690720(intcode)
    print(f"100 * noun + verb = {100 * output_19690720[0] + output_19690720[1]}")

class Test(unittest.TestCase):
    def test_setup_properly(self):
        self.assertEqual(2, 1 + 1)
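The opening of `process_intcode` (the decoding that produces "ADDITION" / "MULTIPLICATION" and the halt on opcode 99) is missing from this excerpt. A minimal self-contained reconstruction, consistent with the fragment's ValueError but with the loop structure assumed:

def process_intcode_sketch(intcode):
    # run the program in place until opcode 99 halts it
    pointer = 0
    while intcode[pointer] != 99:
        opcode, a, b, target = intcode[pointer:pointer + 4]
        if opcode == 1:    # ADDITION
            intcode[target] = intcode[a] + intcode[b]
        elif opcode == 2:  # MULTIPLICATION
            intcode[target] = intcode[a] * intcode[b]
        else:
            raise ValueError("opcode should be 1, 2 or 99")
        pointer += 4
    return intcode

# sanity check: 1 + 1 written to position 0
assert process_intcode_sketch([1, 0, 0, 0, 99]) == [2, 0, 0, 0, 99]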
def gen_vocab(df, whichdata):
    # NOTE: this module targets Python 2 (string.maketrans / str.encode usage)
    if whichdata == "test":
        outfname = config.MURALI_MSVD_VID_CAPS_TEST_PATH
        dictsize = config.MURALI_TEST_VIDS
        capspath = config.MURALI_MSVD_CAPTIONS_TEST_PATH
    elif whichdata == "val":
        outfname = config.MURALI_MSVD_VID_CAPS_VAL_PATH
        dictsize = config.MURALI_VAL_VIDS
        capspath = None
        raise NotImplementedError()
    else:
        outfname = config.MURALI_MSVD_VID_CAPS_TRAIN_PATH
        dictsize = config.MURALI_TRAIN_VIDS
        capspath = config.MURALI_MSVD_CAPTIONS_TRAIN_PATH
    vocab = set()
    punct_dict = get_punctuations()
    translator = string.maketrans("", "")
    vid_caps_dict = {}
    omitted_caps = []
    for index in range(dictsize):
        vid_id = whichdata + "_" + str(index)
        descriptions = utils.read_file_to_list(capspath + str(index) +
                                               ".txt")[0].split("|")
        vid_caps = []
        for desc in descriptions:
            try:
                cap = desc.strip().encode('UTF-8')
                if len(cap) > 0:
                    vid_caps.append(cap)
            except Exception:
                # print vid_id, " : ", desc.strip()
                omitted_caps.append(vid_id + " : " + desc.strip())
        for vid_cap in vid_caps:
            tokens, _ = tokenize(vid_cap, punct_dict, translator)
            if vid_id in vid_caps_dict:
                vid_caps_dict[vid_id].append(tokens)
            else:
                vid_caps_dict[vid_id] = [tokens]
            if whichdata == "train":
                vocab |= set(tokens)
    print("Non-ASCII captions omitted :" + str(len(omitted_caps)))
    utils.write_to_json(vid_caps_dict, outfname)
    print("Size of " + whichdata + " vid caps dict: " + str(len(vid_caps_dict)))
    assert len(vid_caps_dict) == dictsize
    if whichdata == "train":
        vocab_list = list(vocab)
        vocab_list.sort()
        # indices 0 and 1 are reserved for <eos> and UNK
        vocab_dict = {
            vocab_list[index]: index + 2
            for index in range(len(vocab_list))
        }
        # vocab_dict['<bos>'] = 0
        vocab_dict['<eos>'] = 0
        vocab_dict['UNK'] = 1
        vocab_rev_dict = {
            index + 2: vocab_list[index]
            for index in range(len(vocab_list))
        }
        # vocab_rev_dict[0] = '<bos>'
        vocab_rev_dict[0] = '<eos>'
        vocab_rev_dict[1] = 'UNK'
        utils.write_to_json(vocab_dict, config.MURALI_MSVD_VOCAB_PATH)
        utils.write_to_pickle(vocab_rev_dict,
                              config.MURALI_MSVD_REVERSE_VOCAB_PATH)
        print("Size of Vocabulary: " + str(len(vocab)))
    return vocab, vid_caps_dict, omitted_caps
def __fetch_function_call_graph(self):
    """
    :return: 0 success 1 failure

    Handle the intent actions that correspond to each component.
    Component names in the AndroidManifest file are either complete or
    incomplete; a single-word name is considered incomplete.
    In comp_dict the key is the class name and the value is the action
    feature vector; all methods in the class inherit this action feature.
    """
    comp_dict = {}
    for comp_match in COMPONENT_PATTERN.finditer(self.am_content):
        action_list = []
        comp_action_features = []
        comp_detail = comp_match.group(0)
        comp_name = comp_match.group('compname')
        if comp_name.startswith('.'):
            comp_name = self.package_name + comp_name
        elif len(comp_name.split('.')) == 1:
            comp_name = self.package_name + '.' + comp_name
        class_path = join_class_path(comp_name)
        for action_match in INTENT_ACTION_PATTERN.finditer(comp_detail):
            action_list.append(action_match.group('action').split('.')[-1])
        get_filtered_vector(
            comp_action_features, action_list,
            CONSTANTS['INTENT_ACTIONS_126']['REFERENCE_LIST'])
        comp_dict[class_path] = np.array(comp_action_features,
                                         dtype=np.uint8, ndmin=2)

    output_func_call_pairs_txt_path = os.path.join(self.dst_output_path,
                                                   'func_call_pairs.txt')
    if not os.path.exists(output_func_call_pairs_txt_path):
        temp_dict = {}
        smali_search_result = glob.glob(
            os.path.join(self.smali_dir_path, "**\\*.smali"), recursive=True)
        for smali_file in smali_search_result:
            if extract_func_call_pairs_list_from_smali(
                    temp_dict, smali_file) != STATUS_OK:
                print('extract func call pairs failed')
                return STATUS_ERR
        self.func_call_pairs = list(temp_dict.keys())
        write_list_to_file(self.func_call_pairs,
                           output_func_call_pairs_txt_path)
        temp_dict.clear()
    if (not self.func_call_pairs) and read_file_to_list(
            self.func_call_pairs,
            output_func_call_pairs_txt_path) != STATUS_OK:
        return STATUS_ERR

    all_funcs_set = set()
    for call_pair in self.func_call_pairs:
        temp_list = call_pair.split(' ')
        if len(temp_list) == 3:
            all_funcs_set.add(temp_list[0])
            all_funcs_set.add(temp_list[2])
        elif len(temp_list) == 2:
            print('length 2 -> ' + ','.join(temp_list))
        elif len(temp_list) == 1:
            print('length 1 -> ' + ','.join(temp_list))
        elif len(temp_list) == 0:
            print('length 0')
        else:
            print('other length ' + str(len(temp_list)))

    # plus one for the MainNode
    self.nodes_num = len(all_funcs_set) + 1
    if self.nodes_num > 30000:
        return STATUS_ERR
    all_funcs_set = None
    print('nodes num->', self.nodes_num)
    self.adj_matrix = np.zeros((self.nodes_num, self.nodes_num),
                               dtype=np.uint8)
    self.node_features = np.zeros((self.nodes_num, 273), dtype=np.uint8)
    self.node_labels = []
    all_funcs = []

    api_lv_match = TARGET_SDK_VER_PATTERN.search(self.am_content)
    if not api_lv_match:
        api_lv_match = MIN_SDK_VER_PATTERN.search(self.am_content)
    if api_lv_match and int(api_lv_match.group('apilevel')) >= 16:
        self.api_level = api_lv_match.group('apilevel')

    # The MainNode represents the whole app: its features are the app's
    # feature vector and its label is the app's label, malicious 10 / benign 01
    all_funcs.append('MainNode')
    self.node_labels.append([1, 0] if self.is_malicious else [0, 1])
    self.adj_matrix[0] = np.ones((1, self.nodes_num), dtype=np.uint8)
    self.node_features[0] = np.array(self.feature_list, dtype=np.uint8)[0:273]

    for call_pair in self.func_call_pairs:
        temp_list = call_pair.split(' ')
        if len(temp_list) == 3:
            caller = temp_list[0]
            called = temp_list[2]
            # Extract by API: row is the caller, column is the called function
            caller_idx = self.__process_func(caller, all_funcs, comp_dict)
            called_idx = self.__process_func(called, all_funcs, comp_dict)
            self.adj_matrix[caller_idx, called_idx] = 1
        elif len(temp_list) == 2:
            print('length 2 -> ' + ','.join(temp_list))
        elif len(temp_list) == 1:
            print('length 1 -> ' + ','.join(temp_list))
        elif len(temp_list) == 0:
            print('length 0')
        else:
            print('other length ' + str(len(temp_list)))

    write_list_to_file(all_funcs,
                       os.path.join(self.dst_output_path, 'all_funcs.txt'))
    return STATUS_OK
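`__fetch_package_call_graph` earlier in this section loads `adj_matrix.npz` and `node_features.npz` with `sp.load_npz`, so between these two methods the dense arrays built here are presumably converted to sparse matrices and saved. A sketch of that persistence step; the conversion point is an assumption, only the scipy calls themselves are standard:

import numpy as np
import scipy.sparse as sp

adj_matrix = np.zeros((10, 10), dtype=np.uint8)  # stands in for self.adj_matrix
sp.save_npz('adj_matrix.npz', sp.csr_matrix(adj_matrix))
restored = sp.load_npz('adj_matrix.npz')  # what __fetch_package_call_graph reads back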
print('Loading model from checkpoint {}'.format(config.model.checkpoint_path))
checkpoint = torch.load(config.model.checkpoint_path)
embedding_size = checkpoint['embedding_size']

# CUDA for PyTorch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

model = models.load_model(config.model.model_arch,
                          device,
                          embedding_size=embedding_size)
model.load_state_dict(checkpoint['model_state_dict'])

filename = os.path.join(config.dataset.movie.dataset_path, 'bbx.txt')
bbx_list = utils.read_file_to_list(filename)

plotter = utils.VisdomPlotter(config.visdom.server,
                              env_name='video_annotation',
                              port=config.visdom.port)

vd_utils.annotate_video(config.dataset.movie.movie_path,
                        config.output.video_dir,
                        model,
                        device,
                        max_frame=config.dataset.movie.num_frame,
                        bbx_list=bbx_list,
                        tracker_max_age=config.hyperparameters.tracker_max_age,
                        plotter=plotter,
                        name='base')
def annotate_video(movie_file_path: str,
                   dataset_path: str,
                   output_path: str,
                   model: nn.Module,
                   device,
                   max_frame: int = 100000,
                   tracker_max_age: int = 10,
                   plotter: utils.plotter_utils.VisdomPlotter = None,
                   name: str = '',
                   compute_track_mean: bool = False):
    filename = os.path.join(dataset_path, 'bbx.txt')
    print('Getting annotations from {}'.format(filename))
    bbx_list = utils.read_file_to_list(filename)
    if bbx_list:
        bounding_boxes_list = bbx_list
    else:
        bounding_boxes_list = get_bounding_boxes(
            movie_file_path, max_frame=max_frame,
            tracker_max_age=tracker_max_age)

    print('Extracting ROI of the video.')
    cropped_image_list = get_cropped_images(movie_file_path,
                                            bounding_boxes_list,
                                            max_frame=max_frame)
    track_dict = get_track_dict(bounding_boxes_list)
    frame_dict = get_frame_dict(bounding_boxes_list)
    bbx_dict = get_bbx_dict(bounding_boxes_list)

    # Data transform
    data_transform = transforms.Compose([
        transforms.ToTensor()
    ])
    dataset = NumpyDataset(cropped_image_list, transform=data_transform)
    dataloader = torch.utils.data.DataLoader(dataset, num_workers=2,
                                             batch_size=100)

    print('Extracting features.')
    model = model.to(device)
    features = ml_utils.extract_features(dataloader, model, device)

    cluster_techniques_list = ['kmeans', 'spectral', 'hac']
    tsne_features, tsne_chosen_samples = projection_utils.tsne_projection(features)
    pca_features, pca_chosen_samples = projection_utils.pca_projection(features)

    # Frame level clustering
    print('Performing frame level clustering.')
    for cluster_method in cluster_techniques_list:
        cluster_name = '{}_frame_level_{}'.format(name, cluster_method)
        predictions, data_dict = clustering.cluster_techniques(
            features, cluster_method, max_clusters=10)
        write_video(movie_file_path, output_path, predictions, frame_dict,
                    name=cluster_name, max_frame=max_frame)
        plotter.scatter_plot(cluster_name + '_tsne', tsne_features,
                             predictions[tsne_chosen_samples])
        plotter.scatter_plot(cluster_name + '_pca', pca_features,
                             predictions[pca_chosen_samples])

    # Add ground truth if it exists
    gt_file_path = os.path.join(dataset_path, 'bbx_gt.txt')
    if os.path.isfile(gt_file_path):
        print('Creating ground truth video and plots.')
        bbx_to_gt_list = utils.read_file_to_list(gt_file_path)
        bbx_to_gt_dict = utils.list_to_dict(bbx_to_gt_list)
        groundtruth = []
        gt_to_idx_dict = {}
        bbx_count = 0
        for bbx in bounding_boxes_list:
            bbx_idx = bbx[2]
            gt = bbx_to_gt_dict[bbx_idx]
            if gt not in gt_to_idx_dict.keys():
                gt_to_idx_dict[gt] = bbx_count
                bbx_count += 1
            label = gt_to_idx_dict[gt]
            groundtruth.append(label)
        groundtruth = np.array(groundtruth)
        gt_name = '{}_gt'.format(name)
        write_video(movie_file_path, output_path, groundtruth, frame_dict,
                    name=gt_name, max_frame=max_frame)
        plotter.scatter_plot(gt_name + '_tsne', tsne_features,
                             groundtruth[tsne_chosen_samples])
        plotter.scatter_plot(gt_name + '_pca', pca_features,
                             groundtruth[pca_chosen_samples])

    # Track level clustering
    if compute_track_mean:
        print('Performing track level clustering.')
        mean_features = []
        track_to_idx_dict = {}
        for idx, track_idx in enumerate(track_dict.keys()):
            feature_track = features[track_dict[track_idx]]
            mean_features.append(np.mean(feature_track, axis=0))
            track_to_idx_dict[track_idx] = idx
        mean_features = np.asarray(mean_features)

        for cluster_method in cluster_techniques_list:
            cluster_name = '{}_track_level_{}'.format(name, cluster_method)
            mean_predictions, data_dict = clustering.cluster_techniques(
                mean_features, cluster_method, max_clusters=10)
            predictions = []
            for bbx_idx in bbx_dict.keys():
                track_idx = track_to_idx_dict[bbx_dict[bbx_idx][0]]
                predictions.append(mean_predictions[track_idx])
            predictions = np.array(predictions)
            write_video(movie_file_path, output_path, predictions, frame_dict,
                        name=cluster_name, max_frame=max_frame)
            plotter.scatter_plot(cluster_name + '_tsne', tsne_features,
                                 predictions[tsne_chosen_samples])
            plotter.scatter_plot(cluster_name + '_pca', pca_features,
                                 predictions[pca_chosen_samples])
if __name__ == '__main__':
    has_ids_list = True
    print("removing empty lines in original corpus...")
    preproc_csv(config.MSVD_CSV_DATA_PATH, config.MSVD_PREPROC_CSV_DATA_PATH)
    print("loading processed corpus...")
    csv_data = utils.read_csv_data(config.MSVD_PREPROC_CSV_DATA_PATH)
    print("reading video clips ids...")
    if not has_ids_list:
        # read dataset vid ids from the video clips directory
        vid_ids_list = utils.read_dir_files(config.MSVD_VIDEO_DATA_PATH)
    else:
        # read dataset vid ids from a text file
        vid_ids_list = utils.read_file_to_list(config.DATA_DIR +
                                               "present_vid_ids.txt")
    assert len(vid_ids_list) == config.TOTAL_VIDS
    print("filtering clips in df...")
    present_vid_ids, missing_vid_ids, present_vid_ids_csv = filter_clips(
        csv_data, vid_ids_list)
    assert len(present_vid_ids) == config.TOTAL_VIDS
    print("saving filtered df...")
    df = clean_caps_df(csv_data, present_vid_ids, present_vid_ids_csv)
    print("loading final corpus...")
    csv_data = utils.read_csv_data(config.MSVD_FINAL_CORPUS_PATH)
    print("splitting corpus into train-val-test...")
    train_df, val_df, test_df = split_data(csv_data)
    print("generating vocab for train data...")
    vocab, _ = gen_vocab(train_df, "train")
    _, _ = gen_vocab(val_df, "val")