def __fetch_intent_actions(self):
    """
    :return: 0 success 1 failure
    """
    self.intent_actions = []
    output_actions_txt_path = os.path.join(self.dst_output_path, 'actions.txt')
    if not os.path.exists(output_actions_txt_path):
        if extract_spec_list_from_file(
                self.intent_actions, self.am_processed_path,
                EXTRACT_SPECS['ACTION']) != STATUS_OK:
            return STATUS_ERR
        write_list_to_file(self.intent_actions, output_actions_txt_path)
    if (not self.intent_actions) and read_file_to_list(
            self.intent_actions, output_actions_txt_path) != STATUS_OK:
        return STATUS_ERR
    else:
        if self.include_intent_actions_126:
            get_filtered_vector(
                self.feature_list, self.intent_actions,
                CONSTANTS['INTENT_ACTIONS_126']['REFERENCE_LIST'])
            return STATUS_OK
        elif self.include_intent_actions_110:
            get_filtered_vector(
                self.feature_list, self.intent_actions,
                CONSTANTS['INTENT_ACTIONS_110']['REFERENCE_LIST'])
            return STATUS_OK
def __fetch_sensitive_apis(self):
    """
    :return: 0 success 1 failure
    """
    self.sensitive_apis = []
    output_apis_txt_path = os.path.join(self.dst_output_path, 'apis.txt')
    if not os.path.exists(output_apis_txt_path):
        smali_search_result = glob.glob(os.path.join(
            self.smali_dir_path, "**\\*.smali"), recursive=True)
        for smali_file in smali_search_result:
            if extract_sensitive_apis_list_from_smali(
                    self.sensitive_apis, smali_file) != STATUS_OK:
                print('extract apis failed')
                return STATUS_ERR
        write_list_to_file(self.sensitive_apis, output_apis_txt_path)
    if (not self.sensitive_apis) and read_file_to_list(
            self.sensitive_apis, output_apis_txt_path) != STATUS_OK:
        return STATUS_ERR
    else:
        get_filtered_vector(
            self.feature_list, self.sensitive_apis,
            CONSTANTS['SENSITIVE_APIS_106']['REFERENCE_LIST'])
        return STATUS_OK
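# ---------------------------------------------------------------------------
# The snippets in this collection all rely on a write_list_to_file helper (and
# often a read_file_to_list / read_list_from_file counterpart) from their own
# utils modules, which are not shown here. Argument order also varies between
# projects: the two methods above pass (items, path), while most snippets
# below pass (path, items). The following is only a minimal, hypothetical
# sketch of such helpers, matching the (items, path) convention and the 0/1
# status codes used above; the real implementations may differ.
# ---------------------------------------------------------------------------
STATUS_OK = 0
STATUS_ERR = 1


def write_list_to_file(items, file_path):
    """Write each element of items to file_path, one per line."""
    try:
        with open(file_path, 'w', encoding='utf-8') as f:
            for item in items:
                f.write(str(item) + '\n')
        return STATUS_OK
    except OSError:
        return STATUS_ERR


def read_file_to_list(items, file_path):
    """Append the stripped lines of file_path to the existing items list."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            items.extend(line.rstrip('\n') for line in f)
        return STATUS_OK
    except OSError:
        return STATUS_ERR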
def normalize_coordinates(tile_fnames_or_dir, output_dir, jar_file):
    all_files = []
    for file_or_dir in tile_fnames_or_dir:
        if not os.path.exists(file_or_dir):
            print("{0} does not exist (file/directory), skipping".format(file_or_dir))
            continue
        if os.path.isdir(file_or_dir):
            actual_dir_files = glob.glob(os.path.join(file_or_dir, '*.json'))
            all_files.extend(actual_dir_files)
        else:
            all_files.append(file_or_dir)

    if len(all_files) == 0:
        print "No files for normalization found. Exiting."
        return

    print "Normalizing coordinates of {0} files".format(all_files)

    files_urls = []
    for file_name in all_files:
        tiles_url = utils.path2url(file_name)
        files_urls.append(tiles_url)

    list_file = os.path.join(output_dir, "all_files.txt")
    print "list_file", list_file
    utils.write_list_to_file(list_file, files_urls)
    list_file_url = utils.path2url(list_file)

    java_cmd = 'java -Xmx3g -XX:ParallelGCThreads=1 -Djava.awt.headless=true -cp "{0}" org.janelia.alignment.NormalizeCoordinates --targetDir {1} {2}'.format(
        jar_file, output_dir, list_file_url)
    utils.execute_shell_command(java_cmd)
def prepare_data_ids(vid_caps_path, ids_save_path):
    vid_caps_dict = utils.read_from_json(vid_caps_path)
    data_ids = []
    for vid_caps in vid_caps_dict.items():
        vid_id = vid_caps[0]
        if vid_id[-4:] == ".avi":
            vid_id = vid_id[:-4]
        for seq_id in range(len(vid_caps[1])):
            data_id = vid_id + "|" + str(seq_id)
            data_ids.append(data_id)
    utils.write_list_to_file(ids_save_path, data_ids)
def run(from_scratch, scroll_down_count):
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(utils.get_chromedriver_path(),
                              chrome_options=chrome_options)
    driver.maximize_window()
    cities_list = cities_extraction(driver, from_scratch, scroll_down_count)
    logger.info(config.MSG_DICT["CITIES_FOUND_COUNT"].format(len(cities_list)))
    utils.write_list_to_file(config.CITIES_FILENAME, cities_list)
    time.sleep(config.GENERAL_WAITER)
    driver.quit()
def generate_dup2_commands(newfiles):
    renumberfilename = freerel_out["renumberfilename"]
    paths = ["tasks", "filter", "duplicates2", "dup2"]
    program_to_run = cadoprograms.Duplicates2
    progparams = parameters.myparams(program_to_run.get_accepted_keys(), paths)
    progparams.pop("renumber", None)
    commands = []
    for i, (filename, rels) in enumerate(newfiles):
        if filename in generate_dup2_commands.slice_files[i]:
            continue  # nothing to do
        logger.info("Dup2: Processing slice %d", i)
        generate_dup2_commands.slice_rels[i] += rels
        rels = generate_dup2_commands.slice_rels[i]
        generate_dup2_commands.slice_files[i].add(filename)
        files = list(generate_dup2_commands.slice_files[i])
        if len(files) <= 10:
            program = cadoprograms.Duplicates2(*files,
                                               rel_count=rels,
                                               renumber=renumberfilename,
                                               **progparams)
        else:
            filelist = utils.write_list_to_file(files, dup1dir)
            program = cadoprograms.Duplicates2(filelist=filelist,
                                               rel_count=rels,
                                               renumber=renumberfilename,
                                               **progparams)
        command = program.make_command_line()
        cmd_logger.debug(command)
        commands.append(command)
    return commands
def generate_dup1_command(newfiles):
    paths = ["tasks", "filter", "duplicates1", "dup1"]
    program_to_run = cadoprograms.Duplicates1
    progparams = parameters.myparams(program_to_run.get_accepted_keys(), paths)
    progparams.pop("prefix", None)
    progparams.pop("out", None)
    prefix = "dup1.%d" % (time.time())
    if generate_dup1_command.nr_slices is None:
        # first run
        generate_dup1_command.nr_slices = 2  # default to 2**1
        if "nslices_log" in progparams:
            generate_dup1_command.nr_slices = 2 ** progparams["nslices_log"]
        for i in range(0, generate_dup1_command.nr_slices):
            try:
                os.makedirs(os.path.join(dup1dir, str(i)))
            except OSError as exception:
                if exception.errno != errno.EEXIST:
                    raise
    # might want to improve this and maintain status of all files
    # (in case a dup1 run fails) so that a later run can try again
    if len(newfiles) <= 10:
        program = cadoprograms.Duplicates1(*newfiles,
                                           prefix=prefix,
                                           out=dup1dir,
                                           **progparams)
    else:
        filelist = utils.write_list_to_file(newfiles, dup1dir)
        program = cadoprograms.Duplicates1(filelist=filelist,
                                           prefix=prefix,
                                           out=dup1dir,
                                           **progparams)
    command = program.make_command_line()
    cmd_logger.debug(command)
    return command
def clean_caps_df(csv_data, present_vid_ids, present_vid_ids_csv):
    vid_list = list(set([get_vid_ids(s) for s in present_vid_ids]))
    assert len(vid_list) == len(present_vid_ids_csv)
    df = csv_data.loc[((csv_data['VideoID'].isin(vid_list)) &
                       (csv_data['Language'] == 'English')) &
                      csv_data['Description'].notnull()]
    df.to_csv(config.MSVD_FINAL_CORPUS_PATH, index=False, encoding='utf-8')
    df = utils.read_csv_data(config.MSVD_FINAL_CORPUS_PATH)
    omitted_caps = []
    punct_dict = get_punctuations()
    translator = string.maketrans("", "")
    df['Description'] = df.apply(lambda row: clean_caps(
        row['Description'], punct_dict, translator, omitted_caps), axis=1)
    df = df.loc[df['Description'].notnull()]
    df.to_csv(config.MSVD_FINAL_CORPUS_PATH, index=False, encoding='utf-8')
    print("Non-ASCII captions omitted :" + str(len(omitted_caps)))
    utils.write_list_to_file(config.MSVD_OMMITTED_CAPS_PATH, omitted_caps)
    return df
def __fetch_permissions(self):
    """
    :return: 0 success 1 failure
    """
    self.permissions = []
    output_permissions_txt_path = os.path.join(self.dst_output_path,
                                               'permissions.txt')
    if not os.path.exists(output_permissions_txt_path):
        if extract_spec_list_from_file(
                self.permissions, self.am_processed_path,
                EXTRACT_SPECS['PERMISSION']) != STATUS_OK:
            return STATUS_ERR
        write_list_to_file(self.permissions, output_permissions_txt_path)
    if (not self.permissions) and read_file_to_list(
            self.permissions, output_permissions_txt_path) != STATUS_OK:
        return STATUS_ERR
    else:
        get_filtered_vector(self.feature_list, self.permissions,
                            CONSTANTS['PERMISSIONS_147']['REFERENCE_LIST'])
        return STATUS_OK
def filter_clips(csv_data, vid_clips_list):
    pf = []
    mf = []
    tl = 0
    pvids = []
    for index, row in csv_data.iterrows():
        fname = str(row["VideoID"]) + "_" + str(row["Start"]) + "_" + str(
            row["End"]) + ".avi"
        if fname in vid_clips_list:
            if fname not in pf:
                pf.append(fname)
                tl += 1
                if row["VideoID"] not in pvids:
                    pvids.append(row["VideoID"])
        else:
            if fname not in mf:
                mf.append(fname)
                tl += 1
    utils.write_list_to_file(config.DATA_DIR + "present_vid_ids.txt", pf)
    utils.write_list_to_file(config.DATA_DIR + "missing_vid_ids.txt", mf)
    utils.write_list_to_file(config.DATA_DIR + "present_csv_vid_ids.txt", pvids)
    print("Present : {}".format(len(pf)))
    print("Missing : {}".format(len(mf)))
    print("Total (from CSV): {}".format(tl))
    return pf, mf, pvids
def generate_purge_command(dup2_out):
    paths = ["tasks", "filter", "purge", "purge"]
    program_to_run = cadoprograms.Purge
    progparams = parameters.myparams(program_to_run.get_accepted_keys(), paths)
    nfree = freerel_out["nfree"]
    nunique = sum(dup2_out)
    input_nrels = nfree + nunique
    nprimes = freerel_out["nprimes"]
    minindex = int(progparams.get("col_minindex", -1))
    if minindex == -1:
        minindex = int(nprimes / 20.0)
    # For small cases, we want to avoid degenerated cases, so let's
    # keep most of the ideals: memory is not an issue in that case.
    if (minindex < 10000):
        minindex = 500
    progparams.setdefault("col_minindex", minindex)
    keep = progparams.pop("keep", None)
    relsdelfile = None  # not supporting dlp yet
    files = [freerel_out["freerelfilename"]]
    for i in range(generate_dup1_command.nr_slices):
        files += list(generate_dup2_commands.slice_files[i])
    if len(files) <= 10:
        program = cadoprograms.Purge(*files,
                                     nrels=input_nrels,
                                     out=purgedfile,
                                     outdel=relsdelfile,
                                     keep=keep,
                                     nprimes=nprimes,
                                     **progparams)
    else:
        filelist = utils.write_list_to_file(files, dup1dir)
        program = cadoprograms.Purge(nrels=input_nrels,
                                     out=purgedfile,
                                     outdel=relsdelfile,
                                     keep=keep,
                                     nprimes=nprimes,
                                     filelist=filelist,
                                     **progparams)
    command = program.make_command_line()
    cmd_logger.debug(command)
    return command, purgedfile
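# ---------------------------------------------------------------------------
# In the cado-nfs-style snippets above, utils.write_list_to_file is called as
# write_list_to_file(files, dup1dir) and its return value is passed on as a
# filelist path, so that variant presumably writes the list into a new file
# under the given directory and returns the file's path. A hypothetical sketch
# of that variant (the real utils module is not shown and may differ):
# ---------------------------------------------------------------------------
import os
import tempfile


def write_list_to_file(items, directory):
    """Write items one per line into a new file under directory; return its path."""
    fd, path = tempfile.mkstemp(prefix="filelist_", suffix=".txt", dir=directory)
    with os.fdopen(fd, "w") as f:
        for item in items:
            f.write(str(item) + "\n")
    return path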
def split_data(csv_data):
    vid_ids = utils.read_file_to_list(config.MSVD_VID_IDS_ALL_PATH)
    assert len(vid_ids) == config.TOTAL_VIDS
    utils.shuffle_array(vid_ids)
    train_ids = vid_ids[0:1200]
    val_ids = vid_ids[1200:1300]
    test_ids = vid_ids[1300:1970]
    assert len(train_ids) == config.TRAIN_VIDS
    assert len(val_ids) == config.VAL_VIDS
    assert len(test_ids) == config.TEST_VIDS
    utils.write_list_to_file(config.MSVD_VID_IDS_TRAIN_PATH, train_ids)
    utils.write_list_to_file(config.MSVD_VID_IDS_VAL_PATH, val_ids)
    utils.write_list_to_file(config.MSVD_VID_IDS_TEST_PATH, test_ids)
    train_df = filter_df(csv_data, train_ids, config.MSVD_FINAL_CORPUS_TRAIN_PATH)
    val_df = filter_df(csv_data, val_ids, config.MSVD_FINAL_CORPUS_VAL_PATH)
    test_df = filter_df(csv_data, test_ids, config.MSVD_FINAL_CORPUS_TEST_PATH)
    return train_df, val_df, test_df
        # Merge the multiple mfovs pmcc match files into one per direction
        pmcc_fname = os.path.join(matched_pmcc_dir,
                                  "{0}_{1}_match_pmcc.json".format(fname1_prefix, fname2_prefix))
        j += 1
        matched_after_layers += 1

print "all_pmcc_files: {0}".format(all_pmcc_files)

# Create a single file that lists all tilespecs and a single file that lists
# all pmcc matches (the os doesn't support a very long list)
ts_list_file = os.path.join(args.workspace_dir, "all_ts_files.txt")
write_list_to_file(ts_list_file, all_ts_files)
pmcc_list_file = os.path.join(args.workspace_dir, "all_pmcc_files.txt")
write_list_to_file(pmcc_list_file, all_pmcc_files)

# Optimize all layers to a single 3d image
sections_opt_outputs = []
for i in all_layers:
    out_section = os.path.join(post_optimization_dir,
                               '{}_{}'.format(str(i).zfill(4),
                                              os.path.basename(layers_data[str(i)]['ts'])))
    sections_opt_outputs.append(out_section)

dependencies = list(all_running_jobs)
job_optimize = OptimizeLayersElastic(dependencies, sections_opt_outputs,
                                     [ts_list_file], [pmcc_list_file],
                                     post_optimization_dir,
                                     args.max_layer_distance,
                                     conf_fname=args.conf_file_name,
                                     skip_layers=args.skip_layers,
                                     threads_num=4)
all_running_jobs.append(job_optimize)
    bbox1 = BoundingBox.fromList(ts1["bbox"])
    bbox2 = BoundingBox.fromList(ts2["bbox"])
    if bbox1.overlap(bbox2):
        imageUrl1 = ts1["mipmapLevels"]["0"]["imageUrl"]
        imageUrl2 = ts2["mipmapLevels"]["0"]["imageUrl"]
        tile_fname1 = os.path.basename(imageUrl1).split('.')[0]
        tile_fname2 = os.path.basename(imageUrl2).split('.')[0]
        print "Matching features of tiles: {0} and {1}".format(imageUrl1, imageUrl2)
        index_pair = [idx1, idx2]
        match_json = os.path.join(args.workspace_dir,
                                  "{0}_sift_matches_{1}_{2}.json".format(tiles_fname_prefix, tile_fname1, tile_fname2))
        # match the features of overlapping tiles
        if not os.path.exists(match_json):
            match_single_sift_features_and_filter(args.tiles_fname,
                                                  all_features[imageUrl1],
                                                  all_features[imageUrl2],
                                                  match_json, index_pair,
                                                  conf_fname=args.conf_file_name)
        all_matched_features.append(match_json)

print 'features matching took {0:1.4f} seconds'.format(time.time() - start_time)

# Create a single file that lists all sift matches
# (the os doesn't support a very long list)
matches_list_file = os.path.join(args.workspace_dir, "all_matched_sifts_files.txt")
write_list_to_file(matches_list_file, all_matched_features)

# optimize the 2d layer montage
if not os.path.exists(args.output_file_name):
    print "Optimizing section in tilespec: {}".format(args.tiles_fname)
    start_time = time.time()
    optimize_2d_mfovs(args.tiles_fname, matches_list_file,
                      args.output_file_name, args.conf_file_name)
    print '2D Optimization took {0:1.4f} seconds'.format(time.time() - start_time)
        raise NotImplementedError()

    encoded_video = np.loadtxt(encoded_feats_path, delimiter=',')
    print(encoded_video.shape)
    num, dim = encoded_video.shape
    assert num == dictsize
    for vid_id in range(num):
        vid_feats = encoded_video[vid_id].reshape(32, 1024)
        # print(vid_feats.shape)
        np.save(feat_save_path + whichdata + "_" + str(vid_id) + ".npy", vid_feats)


if __name__ == '__main__':
    print("generating vocab for train data...")
    vocab, _, omitted_caps_train = gen_vocab(config.MURALI_TRAIN_VIDS, "train")
    _, _, omitted_caps_test = gen_vocab(config.MURALI_TEST_VIDS, "test")
    omitted_caps = omitted_caps_train + omitted_caps_test
    utils.write_list_to_file(config.MURALI_MSVD_OMMITTED_CAPS_PATH, omitted_caps)

    print("generating train data vid+seq ids...")
    prepare_data_ids(config.MURALI_MSVD_VID_CAPS_TRAIN_PATH,
                     config.MURALI_MSVD_DATA_IDS_TRAIN_PATH)
    # print("generating val data vid+seq ids...")
    # prepare_data_ids(config.MURALI_MSVD_VID_CAPS_VAL_PATH, config.MURALI_MSVD_DATA_IDS_VAL_PATH)
    print("generating test data vid+seq ids...")
    prepare_data_ids(config.MURALI_MSVD_VID_CAPS_TEST_PATH,
                     config.MURALI_MSVD_DATA_IDS_TEST_PATH)

    print("separating train vids encoded features...")
    save_feats("train")
    print("separating test vids encoded features...")
    save_feats("test")
def main():
    # setting parameters
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--mode', type=str, default=None,
                        help='TRAIN or FINETUNE or INFER.')
    parser.add_argument('--epoches', type=int, default=1000,
                        help='num epoches.')
    parser.add_argument('--batch_size', type=int, default=50,
                        help='minibatch size.')
    parser.add_argument('--batch_increase', type=bool, default=True,
                        help='whether to increase the batch_size')
    parser.add_argument('--num_layers_encoder', type=int, default=2,
                        help='number of encoder layers.')
    parser.add_argument('--num_layers_decoder', type=int, default=1,
                        help='number of decoder layers.')
    parser.add_argument('--embedding_dim', type=int, default=100,
                        help='dimension of the embedding vectors in the embedding matrix.')
    parser.add_argument('--num_heads', type=int, default=8,
                        help='number of heads in multi_head attention.')
    parser.add_argument('--rnn_size_encoder', type=int, default=256,
                        help='number of hidden units in encoder.')
    parser.add_argument('--rnn_size_decoder', type=int, default=256,
                        help='number of hidden units in decoder.')
    parser.add_argument('--learning_rate', type=float, default=0.001,
                        help='learning rate in every training step.')
    parser.add_argument('--learning_rate_decay', type=float, default=1,
                        help='only if exponential learning rate is used.')
    parser.add_argument('--learning_rate_decay_steps', type=int, default=100,
                        help='learning rate decay period.')
    parser.add_argument('--max_lr', type=float, default=0.01,
                        help='only if cyclic learning rate is used.')
    parser.add_argument('--label_smoothing', type=float, default=0,
                        help='the label smoothing rate.')
    parser.add_argument('--keep_probability_i', type=float, default=1,  # 0.825
                        help='values inspired by Jeremy Howard\'s fast.ai course.')
    parser.add_argument('--keep_probability_o', type=float, default=1,  # 0.895
                        help='values inspired by Jeremy Howard\'s fast.ai course.')
    parser.add_argument('--keep_probability_h', type=float, default=1,  # 0.86
                        help='values inspired by Jeremy Howard\'s fast.ai course.')
    parser.add_argument('--keep_probability_e', type=float, default=1,  # 0.986
                        help='values inspired by Jeremy Howard\'s fast.ai course.')
    # A bug occurs when 0 is chosen. Please set beam_width greater than 0
    # at the infer stage until the problem is resolved.
    parser.add_argument('--beam_width', type=int, default=1,
                        help='only used in inference, for Beam Search.')
    parser.add_argument('--clip', type=int, default=5,
                        help='value to clip the gradients to in training process.')
    parser.add_argument('--inference_targets', type=int, default=False,
                        help='maximum iterations at decoding period')
    parser.add_argument('--use_cyclic_lr', type=int, default=False,
                        help='use cyclical learning rates.')
    parser.add_argument('--key_words_biasing', type=bool, default=True,
                        help='whether to implement CLAS for key words, default YES')
    parser.add_argument('--attention_type', type=str, default='MultiHeadAttention',
                        help='MultiHeadAttention or BahdanauAttention can be selected.')
    parser.add_argument('--attention_type_bias', type=str, default='MultiHeadAttention',
                        help='MultiHeadAttention or BahdanauAttention can be selected.')
    parser.add_argument('--crf_layer', type=bool, default=True,
                        help='if add a crf layer on the decoder outputs.')
    parser.add_argument('--dev', type=str, default='cpu',
                        help='training by CPU or GPU, input cpu or gpu:0 or gpu:1 or gpu:2 or gpu:3.')
    args = parser.parse_args()

    ##################################################################################
    # initialize the data, model graph, parameters
    ##################################################################################
    print("creating data operator...")
    # param vocab_create_mode='BUILD' in the first training
    # the trn files and wav files are saved in different folders
    if args.mode == 'INFER':
        args.batch_size = 1
        data = Corpus(trn_file=TEST_TRN_FILE, wav_file=TEST_WAV_FILE,
                      mfcc_file=TEST_OUTPUT_MFCC_FILE, args=args,
                      vocab_create_mode='LOAD', mfcc_create='N')
    else:
        data = Corpus(trn_file=TRN_FILE, wav_file=WAV_FILE,
                      mfcc_file=OUTPUT_MFCC_FILE, args=args,
                      vocab_create_mode='LOAD', mfcc_create='N')

    print("building model graph...")
    model = LAS(args, data.vocab)
    model.build_model()

    saver = tf.train.Saver()
    sess = tf.Session()
    print("initializing parameters...")
    sess.run(tf.global_variables_initializer())

    ##################################################################################
    # TRAIN or INFERENCE stage
    ##################################################################################
    if args.mode == 'TRAIN':
        # train
        with tf.device("/" + str(args.dev)):
            best_loss = np.inf
            for epoch in range(args.epoches):
                # attempt to increase the batch_size: add 10 every 50 epoches,
                # but cap the batch_size at 100 because of the memory limit.
                if epoch % 50 == 0 and args.batch_increase and (epoch != 0):
                    args.batch_size += 10
                    if args.batch_size >= 100:
                        args.batch_increase = False
                avg_loss = iter_epoches(sess, epoch, data, model)
                # if the current loss is smaller than the best
                if avg_loss < best_loss:
                    best_loss = avg_loss
                    print("best_loss: %6f" % (best_loss))
                    # save model
                    save_path = saver.save(sess, "save/model.ckpt")
    elif args.mode == 'FINETUNE':
        # train the model based on the parameters of the previous training
        with tf.device("/" + str(args.dev)):
            # read model from file
            saver.restore(sess, "save/model.ckpt")
            best_loss = np.inf
            for epoch in range(args.epoches):
                avg_loss = iter_epoches(sess, epoch, data, model)
                if avg_loss < best_loss:
                    best_loss = avg_loss
                    # save model
                    print("best_loss: %6f" % (best_loss))
                    save_path = saver.save(sess, "save/model.ckpt")
    elif args.mode == 'INFER':
        with tf.device("/" + str(args.dev)):
            # read model parameters from file
            saver.restore(sess, "save/model.ckpt")
            batches = data.batch_generator()
            lines = []
            wers = []
            count = 0
            biases = INFERENCE_BIAS
            bias_seq_len = [len(bias) for bias in biases]
            biases = data.trans_label_to_index(biases)
            biases = data.padding(biases, bias_seq_len)
            while True:
                count += 1
                if count % 1 == 0:
                    print(str(count) + ' finished...')
                try:
                    mfcc_features, audio_seq_len, labels, label_seq_len, _, _ = \
                        get_feeds(batches)
                    bias_att_len = [len(biases) for _ in range(len(labels))]
                    feed = {
                        model.audios: mfcc_features,
                        model.audio_sequence_lengths: audio_seq_len,
                        model.bias_ids: biases,
                        model.char_sequence_lengths: label_seq_len,
                        model.bias_sequence_lengths: bias_seq_len,
                        model.bias_attention_lengths: bias_att_len
                    }
                    train_ops = model.sample_words
                    preds = run_train_op(sess, train_ops, feed)
                    for p, label in zip(preds, labels):
                        sen = np.transpose(np.array(p), [1, 0])
                        line = ' '.join(data.trans_index_to_label(list(sen[0])))
                        lines.append(line)
                        # calculate the WER
                        wers.append(get_edit_distance(line, label))
                except StopIteration:
                    break
            wer = np.mean(np.array(wers))
            print(wer)
            utils.write_list_to_file('pred/predictions.txt', lines, 'a+')
def extract_roi_from_matlab_annotations(movie_path: str,
                                        annotation_path: str,
                                        output_path: str,
                                        max_frame: int = 100000):
    if not os.path.exists(output_path):
        os.mkdir(output_path)

    # Create video source instance
    print('Initializing video capture at {}'.format(movie_path))
    video_src = Video_Reader(movie_path)

    _, image = video_src.get_frame()
    video_src.reset()
    img_height, img_width, img_channel = image.shape

    print('Reading annotation at {}'.format(annotation_path))
    Annotation_list = bbt.Read_Annotation(annotation_path, (img_width, img_height))

    cooccurring_tracks = []
    bounding_boxes_list = []
    bbx_to_gt_list = []
    track_to_gt_list = []

    print('Extracting face patches.')
    frame_idx = 0
    bbx_idx = 0
    num_frame = min(len(Annotation_list), max_frame)
    tbar = tqdm.tqdm(range(num_frame))
    for j in tbar:
        ret, image = video_src.get_frame()
        if not ret:
            break

        bounding_boxes = Annotation_list[frame_idx]
        track_list = []
        for bbx in bounding_boxes:
            cropped_image = image[bbx[1]:bbx[3], bbx[0]:bbx[2], :]
            cropped_image = cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB)
            cropped_image = Image.fromarray(cropped_image)
            cropped_image = utils.make_square(cropped_image)
            cropped_image = cropped_image.resize((160, 160), resample=Image.LANCZOS)

            track_id = bbx[6]
            gt_label = bbx[4]
            bounding_boxes_list.append(
                [frame_idx, track_id, bbx_idx, bbx[0], bbx[1], bbx[2], bbx[3]])
            bbx_to_gt_list.append([bbx_idx, gt_label])
            track_to_gt_list.append([track_id, gt_label])

            # Save image
            dir_name = '{:04d}'.format(track_id)
            image_name = '{:06d}.png'.format(bbx_idx)
            save_path = os.path.join(output_path, dir_name)
            if not os.path.exists(save_path):
                os.mkdir(save_path)
            save_file_path = os.path.join(save_path, image_name)
            cropped_image.save(save_file_path)

            track_list.append(track_id)
            bbx_idx += 1

        # Note co-occurring tracks
        if len(track_list) > 1:
            track_list = sorted(track_list)
            if track_list not in cooccurring_tracks:
                cooccurring_tracks.append(track_list)

        frame_idx += 1

    # Save co-occurring tracks
    utils.write_list_to_file(
        os.path.join(output_path, "cooccurring_tracks.txt"), cooccurring_tracks)
    # Save bbx
    utils.write_list_to_file(os.path.join(output_path, "bbx.txt"),
                             bounding_boxes_list)
    # Save ground truth
    utils.write_list_to_file(os.path.join(output_path, "bbx_gt.txt"),
                             bbx_to_gt_list)
    utils.write_list_to_file(os.path.join(output_path, "track_gt.txt"),
                             track_to_gt_list)

    print('{} co-occurring tracks.'.format(len(cooccurring_tracks)))
def run_submit(args):
    augment = ['null']
    out_dir = args.out_dir + f'/{args.model_name}'
    initial_checkpoint = args.initial_checkpoint
    batch_size = args.batch_size

    ## setup out_dir
    os.makedirs(out_dir + '/submit', exist_ok=True)

    log = Logger()
    log.open(out_dir + '/log.submit.txt', mode='a')
    log.write('\n--- [START %s] %s\n\n' % (IDENTIFIER, '-' * 64))
    log.write('\t%s\n' % COMMON_STRING)
    log.write('\n')
    log.write('\tSEED = %u\n' % SEED)
    log.write('\t__file__ = %s\n' % __file__)
    log.write('\tout_dir = %s\n' % out_dir)
    log.write('\n')
    log.write('submitting .... @ %s\n' % str(augment))
    log.write('initial_checkpoint = %s\n' % initial_checkpoint)
    log.write('\n')

    if 1:  # save
        log.write('** dataset setting **\n')

        files_train = [f'train_image_data_{fid}.feather' for fid in range(4)]
        data = read_data(args.data_dir, files_train)
        df = pd.read_csv(args.df_path)
        valid_split = np.load(args.data_dir + '/valid_b_fold1_15985.npy').tolist()
        valid_df = df[df['image_id'].isin(valid_split)]
        test_dataset = KaggleDataset(
            df=df,
            data=data,
            idx=valid_df.index.values,
            augment=valid_augment,
        )
        log.write('\n')

        ## net
        log.write('** net setting **\n')
        if args.model_name == 'serex50':
            net = Serex50_Net().cuda()
        elif args.model_name == 'effnetb3':
            net = EfficientNet_3().cuda()
        else:
            raise NotImplementedError
        net.load_state_dict(torch.load(initial_checkpoint,
                                       map_location=lambda storage, loc: storage),
                            strict=True)

        image_id, truth, probability = do_evaluate(net, test_dataset,
                                                   batch_size, augment)

    if 1:  # save
        write_list_to_file(out_dir + '/submit/image_id.txt', image_id)
        write_pickle_to_file(out_dir + '/submit/probability.pickle', probability)
        write_pickle_to_file(out_dir + '/submit/truth.pickle', truth)

    if 1:
        image_id = read_list_from_file(out_dir + '/submit/image_id.txt')
        probability = read_pickle_from_file(out_dir + '/submit/probability.pickle')
        truth = read_pickle_from_file(out_dir + '/submit/truth.pickle')
        num_test = len(image_id)

    if 1:
        recall, average_recall = compute_kaggle_metric(probability, truth)
        log.write('average_recall : %f\n' % (average_recall))
        for i, name in enumerate(TASK_NAME):
            log.write('%28s %f\n' % (name, recall[i]))
        log.write('\n')
def extract_roi(movie_path: str,
                output_path: str,
                max_frame: int = 100000,
                tracker_max_age: int = 10):
    # Create video source instance
    print('Initializing video capture at {}'.format(movie_path))
    video_src = Video_Reader(movie_path)

    _, image = video_src.get_frame()
    video_src.reset()

    my_fastdt = FAST_DT("cpu", tracker_max_age=tracker_max_age)

    print('Extracting face patches.')
    image_dict = {}
    bbx_dict = {}
    cooccurring_tracks = []
    bbx_idx = 0
    tbar = tqdm.tqdm(range(max_frame))
    for frame_idx in tbar:
        ret, image = video_src.get_frame()
        if not ret:
            break

        bounding_boxes = my_fastdt.predict(image)

        for bbx in bounding_boxes:
            cropped_image = image[bbx[1]:bbx[3], bbx[0]:bbx[2], :]
            cropped_image = cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB)
            cropped_image = Image.fromarray(cropped_image)
            cropped_image = utils.make_square(cropped_image)
            cropped_image = F.resize(cropped_image, size=160, interpolation=1)

            track_id = bbx[4]
            # bounding_boxes_list.append([frame_idx, track_id, bbx_idx, bbx[0], bbx[1], bbx[2], bbx[3]])

            if track_id not in image_dict.keys():
                image_dict[track_id] = [(cropped_image, bbx_idx, frame_idx)]
                bbx_dict[track_id] = [[
                    frame_idx, track_id, bbx_idx, bbx[0], bbx[1], bbx[2], bbx[3]
                ]]
            else:
                image_dict[track_id].append((cropped_image, bbx_idx, frame_idx))
                bbx_dict[track_id].append([
                    frame_idx, track_id, bbx_idx, bbx[0], bbx[1], bbx[2], bbx[3]
                ])

            bbx_idx += 1

    # Remove the last samples of each track as they are residual samples
    # from the tracker max age
    print('Removing residual samples.')
    track_id_list = list(image_dict.keys())
    for track_id in track_id_list:
        if len(image_dict[track_id]) + 1 < tracker_max_age:
            image_dict.pop(track_id)
            bbx_dict.pop(track_id)
        else:
            image_dict[track_id] = image_dict[track_id][1:-tracker_max_age]
            bbx_dict[track_id] = bbx_dict[track_id][1:-tracker_max_age]

    # Create the bounding_box_list
    bounding_boxes_list = []
    for track_id in bbx_dict.keys():
        for bbx in bbx_dict[track_id]:
            bounding_boxes_list.append(bbx)

    # Convert the track-keyed dictionary to a frame-keyed dictionary
    print('Creating dataset.')
    if not os.path.exists(output_path):
        os.mkdir(output_path)

    frame_to_track_dict = {}
    tbar2 = tqdm.tqdm(image_dict.keys())
    for track_id in tbar2:
        for cropped_image, bbx_idx, frame_idx in image_dict[track_id]:
            if frame_idx not in frame_to_track_dict.keys():
                frame_to_track_dict[frame_idx] = [track_id]
            else:
                frame_to_track_dict[frame_idx].append(track_id)

            # Save image
            dir_name = '{:04d}'.format(track_id)
            image_name = '{:06d}.png'.format(bbx_idx)
            save_path = os.path.join(output_path, dir_name)
            if not os.path.exists(save_path):
                os.mkdir(save_path)
            save_file_path = os.path.join(save_path, image_name)
            cropped_image.save(save_file_path)

    # Find co-occurring tracks
    print('Forming co-occurring tracks file.')
    for frame_idx in frame_to_track_dict.keys():
        track_list = []
        for track_id in frame_to_track_dict[frame_idx]:
            track_list.append(track_id)

        # Note co-occurring tracks
        if len(track_list) > 1:
            track_list = sorted(track_list)
            if track_list not in cooccurring_tracks:
                cooccurring_tracks.append(track_list)

    # Save co-occurring tracks
    utils.write_list_to_file(
        os.path.join(output_path, "cooccurring_tracks.txt"), cooccurring_tracks)
    # Save bbx
    utils.write_list_to_file(os.path.join(output_path, "bbx.txt"),
                             bounding_boxes_list)

    print('{} co-occurring tracks.'.format(len(cooccurring_tracks)))
        matched_json_basename = "{0}_matches_{1}_{2}.json".format(
            tiles_fname_prefix, tile_fname1, tile_fname2)
        matched_json = os.path.join(layer_matched_sifts_dir, matched_json_basename)
        # match the features of overlapping tiles
        if not os.path.exists(matched_json):
            print("Matching sift of tiles: {0} and {1}".format(imageUrl1, imageUrl2))
            match_single_sift_features_and_filter(
                f, sifts_1, sifts_2, matched_json, index_pair)
        layers_data[slayer]['matched_sifts'].append(matched_json)

    matches_list_file = os.path.join(
        layer_matched_sifts_dir,
        "{}_matched_sifts_files.txt".format(tiles_fname_prefix))
    write_list_to_file(matches_list_file, layers_data[slayer]['matched_sifts'])

    # optimize (affine) the 2d layer matches
    opt_montage_json = os.path.join(
        optimized_2d_dir, "{0}_montaged.json".format(tiles_fname_prefix))
    if not os.path.exists(opt_montage_json):
        print("Optimizing (affine) layer matches: {0}".format(slayer))
        optimize_2d_stitching(f, matches_list_file, opt_montage_json)

    if render_first:
        render_tile(opt_montage_json)
        render_first = False
        a = input("Rendered right?(Yes/No)")
        if a == 'Yes' or a == 'yes':
            continue
        else:
            sys.exit(1)
def run():
    cities = utils.read_list_from_file(config.CITIES_FILENAME)
    users_list = get_all_users(cities)
    utils.write_list_to_file(config.USERS_FILENAME, users_list)
# Verify that all the layers are there and that there are no holes
all_layers.sort()
for i in range(len(all_layers) - 1):
    if all_layers[i + 1] - all_layers[i] != 1:
        for l in range(all_layers[i] + 1, all_layers[i + 1]):
            if l not in skipped_layers:
                print "Error missing layer {} between: {} and {}".format(l, all_layers[i], all_layers[i + 1])
                sys.exit(1)

# Normalize the sections
print [layer_to_bbox[l] for l in layer_to_bbox.keys()]
normalize_coordinates([layer_to_bbox[l] for l in layer_to_bbox.keys()],
                      norm_dir, args.jar_file)

norm_list_file = os.path.join(args.workspace_dir, "all_norm_files.txt")
write_list_to_file(norm_list_file, all_norm_files)

# Render each layer individually
for tiles_fname in glob.glob(os.path.join(norm_dir, '*.json')):
    tiles_fname_prefix = os.path.splitext(os.path.basename(tiles_fname))[0]
    # read the layer from the file
    layer = read_layer_from_file(tiles_fname)

    # Check if it already rendered the files (don't know the output type)
    render_out_files = glob.glob(os.path.join(
        args.output_dir, '{0:0>4}_{1}.*'.format(layer, tiles_fname_prefix)))
    if len(render_out_files) > 0:
        print "Skipping rendering of layer {}, because found: {}".format(layer, render_out_files)
            fold_ft_sentences[fold] = {
                "all": [],
                "ob": [],
                "eb": [],
                "s2r": []
            }
        append_to_aggreg_dict(fold_ft_sentences[fold], ft_sentences)

        if fold not in fold_sentences:
            fold_sentences[fold] = []
        fold_sentences[fold].extend(sentences)

        for type, ft_sents in ft_sentences.items():
            utils.write_list_to_file(
                ft_sents,
                os.path.join(output_path, sys_name, ".".join([fold, type, "prep.ft"])))

        utils.write_json_line_by_line(
            sentences, os.path.join(output_path, sys_name, fold + ".prep"))

for fold, sentences in fold_ft_sentences.items():
    for type, ft_sents in sentences.items():
        utils.write_list_to_file(
            ft_sents,
            os.path.join(output_path, ".".join([fold, type, "prep.ft"])))

for fold, sentences in fold_sentences.items():
    utils.write_json_line_by_line(
        sentences, os.path.join(output_path, fold + ".prep"))
def create_post_filter_jobs(slayer, filtered_ts_fname, layers_data, jobs,
                            matched_sifts_dir, workspace_dir, output_dir,
                            conf_file_name):

    layer_matched_sifts_intra_dir = os.path.join(
        matched_sifts_dir, os.path.join(layers_data[slayer]['prefix'], 'intra'))
    layer_matched_sifts_inter_dir = os.path.join(
        matched_sifts_dir, os.path.join(layers_data[slayer]['prefix'], 'inter'))
    create_dir(layer_matched_sifts_intra_dir)
    create_dir(layer_matched_sifts_inter_dir)

    # Read the filtered tilespec
    tiles_fname_prefix = os.path.splitext(os.path.basename(filtered_ts_fname))[0]
    cur_tilespec = load_tilespecs(filtered_ts_fname)

    mfovs = set()
    for ts in cur_tilespec:
        mfovs.add(ts["mfov"])

    # create the intra matched sifts directories
    for mfov in mfovs:
        mfov_intra_dir = os.path.join(layer_matched_sifts_intra_dir, str(mfov))
        create_dir(mfov_intra_dir)

    # A map between layer to a list of multiple matches
    multiple_match_jobs = {}
    # read every pair of overlapping tiles, and match their sift features
    jobs_match_intra_mfovs = {}
    jobs_match_inter_mfovs = []
    indices = []
    # TODO - use some other method to detect overlapping tiles
    for pair in itertools.combinations(xrange(len(cur_tilespec)), 2):
        idx1 = pair[0]
        idx2 = pair[1]
        ts1 = cur_tilespec[idx1]
        ts2 = cur_tilespec[idx2]
        # if the two tiles intersect, match them
        bbox1 = BoundingBox.fromList(ts1["bbox"])
        bbox2 = BoundingBox.fromList(ts2["bbox"])
        if bbox1.overlap(bbox2):
            imageUrl1 = ts1["mipmapLevels"]["0"]["imageUrl"]
            imageUrl2 = ts2["mipmapLevels"]["0"]["imageUrl"]
            tile_fname1 = os.path.basename(imageUrl1).split('.')[0]
            tile_fname2 = os.path.basename(imageUrl2).split('.')[0]
            index_pair = ["{}_{}".format(ts1["mfov"], ts1["tile_index"]),
                          "{}_{}".format(ts2["mfov"], ts2["tile_index"])]

            if ts1["mfov"] == ts2["mfov"]:
                # Intra mfov job
                cur_match_dir = os.path.join(layer_matched_sifts_intra_dir,
                                             str(ts1["mfov"]))
            else:
                # Inter mfov job
                cur_match_dir = layer_matched_sifts_inter_dir
            match_json = os.path.join(
                cur_match_dir,
                "{0}_sift_matches_{1}_{2}.json".format(tiles_fname_prefix,
                                                       tile_fname1, tile_fname2))
            # match the features of overlapping tiles
            if not os.path.exists(match_json):
                print "Matching sift of tiles: {0} and {1}".format(imageUrl1, imageUrl2)
                # The filter is done, so assumes no dependencies
                dependencies = []

                # Check if the job already exists
                if ts1["mfov"] == ts2["mfov"]:
                    # Intra mfov job
                    if ts1["mfov"] in jobs[slayer]['matched_sifts']['intra'].keys():
                        job_match = jobs[slayer]['matched_sifts']['intra'][ts1["mfov"]]
                    else:
                        job_match = MatchMultipleSiftFeaturesAndFilter(
                            cur_match_dir, filtered_ts_fname,
                            "intra_l{}_{}".format(slayer, ts1["mfov"]),
                            threads_num=4, wait_time=None,
                            conf_fname=conf_file_name)
                        jobs[slayer]['matched_sifts']['intra'][ts1["mfov"]] = job_match
                else:
                    # Inter mfov job
                    if jobs[slayer]['matched_sifts']['inter'] is None:
                        job_match = MatchMultipleSiftFeaturesAndFilter(
                            cur_match_dir, filtered_ts_fname,
                            "inter_{}".format(slayer),
                            threads_num=4, wait_time=None,
                            conf_fname=conf_file_name)
                        jobs[slayer]['matched_sifts']['inter'] = job_match
                    else:
                        job_match = jobs[slayer]['matched_sifts']['inter']
                job_match.add_job(dependencies,
                                  layers_data[slayer]['sifts'][imageUrl1],
                                  layers_data[slayer]['sifts'][imageUrl2],
                                  match_json, index_pair)
                #jobs[slayer]['matched_sifts'].append(job_match)
            layers_data[slayer]['matched_sifts'].append(match_json)

    # Create a single file that lists all sift matches
    # (the os doesn't support a very long list)
    matches_list_file = os.path.join(
        workspace_dir,
        "{}_matched_sifts_files.txt".format(tiles_fname_prefix))
    write_list_to_file(matches_list_file, layers_data[slayer]['matched_sifts'])

    # optimize (affine) the 2d layer matches
    opt_montage_json = os.path.join(
        output_dir, "{0}_montaged.json".format(tiles_fname_prefix))
    if not os.path.exists(opt_montage_json):
        print "Optimizing (affine) layer matches: {0}".format(slayer)
        dependencies = []
        if jobs[slayer]['matched_sifts']['inter'] is not None:
            dependencies.append(jobs[slayer]['matched_sifts']['inter'])
        if jobs[slayer]['matched_sifts']['intra'] is not None and len(
                jobs[slayer]['matched_sifts']['intra']) > 0:
            dependencies.extend(jobs[slayer]['matched_sifts']['intra'].values())
        job_opt_montage = OptimizeMontageTransform(dependencies,
                                                   filtered_ts_fname,
                                                   matches_list_file,
                                                   opt_montage_json,
                                                   conf_fname=conf_file_name)
    layers_data[slayer]['optimized_montage'] = opt_montage_json
def __fetch_function_call_graph(self):
    """
    :return:
    """
    """
    Deal with each component's corresponding actions.
    Component names in the AndroidManifest file are either complete or incomplete;
    if there is only one word, the name is considered incomplete.
    In comp_dict, the key is the class name and the value is the action feature vector.
    All methods in the class inherit this action feature.
    """
    comp_dict = {}
    for comp_match in COMPONENT_PATTERN.finditer(self.am_content):
        action_list = []
        comp_action_features = []
        comp_detail = comp_match.group(0)
        comp_name = comp_match.group('compname')
        if comp_name.startswith('.'):
            comp_name = self.package_name + comp_name
        elif len(comp_name.split('.')) == 1:
            comp_name = self.package_name + '.' + comp_name
        class_path = join_class_path(comp_name)
        for action_match in INTENT_ACTION_PATTERN.finditer(comp_detail):
            action_list.append(action_match.group('action').split('.')[-1])
        get_filtered_vector(
            comp_action_features, action_list,
            CONSTANTS['INTENT_ACTIONS_126']['REFERENCE_LIST'])
        comp_dict[class_path] = np.array(comp_action_features,
                                         dtype=np.uint8, ndmin=2)

    output_func_call_pairs_txt_path = os.path.join(self.dst_output_path,
                                                   'func_call_pairs.txt')
    if not os.path.exists(output_func_call_pairs_txt_path):
        temp_dict = {}
        smali_search_result = glob.glob(os.path.join(
            self.smali_dir_path, "**\\*.smali"), recursive=True)
        for smali_file in smali_search_result:
            if extract_func_call_pairs_list_from_smali(
                    temp_dict, smali_file) != STATUS_OK:
                print('extract func call pairs failed')
                return STATUS_ERR
        self.func_call_pairs = list(temp_dict.keys())
        write_list_to_file(self.func_call_pairs,
                           output_func_call_pairs_txt_path)
        temp_dict.clear()
    if (not self.func_call_pairs) and read_file_to_list(
            self.func_call_pairs, output_func_call_pairs_txt_path) != STATUS_OK:
        return STATUS_ERR

    all_funcs_set = set()
    for call_pair in self.func_call_pairs:
        temp_list = call_pair.split(' ')
        if len(temp_list) == 3:
            all_funcs_set.add(temp_list[0])
            all_funcs_set.add(temp_list[2])
        elif len(temp_list) == 2:
            print('length 2 -> ' + ','.join(temp_list))
        elif len(temp_list) == 1:
            print('length 1 -> ' + ','.join(temp_list))
        elif len(temp_list) == 0:
            print('length 0')
        else:
            print('other length ' + str(len(temp_list)))

    # plus one for the MainNode
    self.nodes_num = len(list(all_funcs_set)) + 1
    if self.nodes_num > 30000:
        return STATUS_ERR
    all_funcs_set = None
    print('nodes num->', self.nodes_num)

    self.adj_matrix = np.zeros((self.nodes_num, self.nodes_num), dtype=np.uint8)
    self.node_features = np.zeros((self.nodes_num, 273), dtype=np.uint8)
    self.node_labels = []
    all_funcs = []

    api_lv_match = TARGET_SDK_VER_PATTERN.search(self.am_content)
    if not api_lv_match:
        api_lv_match = MIN_SDK_VER_PATTERN.search(self.am_content)
    if api_lv_match and int(api_lv_match.group('apilevel')) >= 16:
        self.api_level = api_lv_match.group('apilevel')

    # The MainNode is characterized by the entire app, and its label is the
    # label of the app: malicious 10, benign 01
    all_funcs.append('MainNode')
    self.node_labels.append([1, 0]) if self.is_malicious else self.node_labels.append([0, 1])
    self.adj_matrix[0] = np.ones((1, self.nodes_num), dtype=np.uint8)
    self.node_features[0] = np.array(self.feature_list, dtype=np.uint8)[0:273]

    for call_pair in self.func_call_pairs:
        temp_list = call_pair.split(' ')
        if len(temp_list) == 3:
            caller = temp_list[0]
            called = temp_list[2]
            """
            Extract by API
            """
            # row: caller | column: called
            caller_idx = self.__process_func(caller, all_funcs, comp_dict)
            called_idx = self.__process_func(called, all_funcs, comp_dict)
            self.adj_matrix[caller_idx, called_idx] = 1
        elif len(temp_list) == 2:
            print('length 2 -> ' + ','.join(temp_list))
        elif len(temp_list) == 1:
            print('length 1 -> ' + ','.join(temp_list))
        elif len(temp_list) == 0:
            print('length 0')
        else:
            print('other length ' + str(len(temp_list)))

    write_list_to_file(all_funcs,
                       os.path.join(self.dst_output_path, 'all_funcs.txt'))
    return STATUS_OK
"{0}_{1}_filter_ransac.json".format(fname1_prefix, fname2_prefix)) if not os.path.exists(ransac_fname): print "Filter-and-Ransac of layers: {0} and {1}".format(i, i + j) filter_ransac(match_json, path2url(layer_to_ts_json[i]), ransac_fname, args.jar_file, conf) all_model_files.append(ransac_fname) j += 1 matched_after_layers += 1 # Optimize all layers to a single 3d image all_ts_files = layer_to_ts_json.values() create_dir(args.output_dir) ts_list_file = os.path.join(args.workspace_dir, "all_ts_files.txt") write_list_to_file(ts_list_file, all_ts_files) matched_sifts_list_file = os.path.join(args.workspace_dir, "all_matched_sifts_files.txt") write_list_to_file(matched_sifts_list_file, all_matched_sifts_files) model_list_file = os.path.join(args.workspace_dir, "all_model_files.txt") write_list_to_file(model_list_file, all_model_files) optimize_layers_affine([ts_list_file], [matched_sifts_list_file], [model_list_file], fixed_layers, args.output_dir, args.max_layer_distance, args.jar_file, conf, args.skip_layers, manual_matches=args.manual_match)
    bbox_and_norm_jobs.append(bbox_job)

# Normalize the coordinates of all files (in a single execution)
normalized_all_files = True
for f in norm_files:
    if not os.path.exists(f):
        normalized_all_files = False
        break
if not normalized_all_files:
    norm_job = NormalizeCoordinates(jobs['bbox'], bbox_files, norm_dir,
                                    args.jar_file, norm_files)
    bbox_and_norm_jobs.append(norm_job)

norm_list_file = os.path.join(args.workspace_dir, "all_norm_files.txt")
write_list_to_file(norm_list_file, norm_files)

# Perform the rendering
for f in json_files.keys():
    # read the layer from the file
    layer = read_layer_from_file(f)

    # If the layer in the file is not in the required range, continue to the next file
    if args.from_layer != -1:
        if layer < args.from_layer:
            continue
    if args.to_layer != -1:
        if layer > args.to_layer:
            continue

    tiles_fname = os.path.basename(f)