def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_folder', help="Data from the Platform", required=True)
    parser.add_argument('--trained_model', help="Trained model", required=True)
    args = parser.parse_args()

    # Parameters
    data_folder = args.data_folder
    trained_model = args.trained_model

    # Configurations
    SETUP_PATH = 'configuration_test.yml'
    configurations = Configurations(SETUP_PATH)
    eprint(''.join("%s:\t%s\n" % item for item in vars(configurations).items()))

    # Data
    get_data_test(data_folder, configurations, trained_model)

    # Inference
    # predictions = test_model(X_test, data_folder, trained_model, configurations)
    # for i in range(5):
    generate_visuals(data_folder,
                     os.path.join(configurations.output_folder, 'Prediction/'),
                     thold_area=0)
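# Hedged sketch: `eprint` and `mkdir_if_not_exist` are used throughout this code
# but not defined in this excerpt. A minimal implementation consistent with how
# they are called might look like this (an assumption, not necessarily the
# project's actual version):
import os
import sys


def eprint(*args, **kwargs):
    # Log to stderr so stdout stays free for program output.
    print(*args, file=sys.stderr, **kwargs)


def mkdir_if_not_exist(dir_path):
    # Create the directory if needed and return the path (train_model below
    # uses the return value as a directory string).
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    return dir_path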
def _recursive_get_urls(crawled_urls, test_page, max_urls, parent_url, domain, depth=0):
    if depth == 0 or len(crawled_urls) == max_urls:
        return crawled_urls
    asyncio.get_event_loop().run_until_complete(get_page(test_page, parent_url))
    html = test_page.source
    soup = BeautifulSoup(html, features='html.parser')
    urls = soup.findAll('a')
    for a in set(urls):
        url = a.get('href')
        if url is None:
            continue
        # Resolve root-relative links against the parent URL
        if url.startswith('/'):
            url = parent_url.rstrip('/') + url
        if urlparse(url).netloc == domain and url not in crawled_urls:
            if len(crawled_urls) < max_urls:
                crawled_urls.append(url)
                eprint('[LOG] Added: {}'.format(url))
                _recursive_get_urls(crawled_urls, test_page, max_urls, url,
                                    domain, depth - 1)
    return crawled_urls
def predict(image_file):
    model_path = os.path.join('inference/model_files', 'frednetv2.pth')
    if not os.path.exists(model_path):
        eprint("[ERR] Model file does not exist")
        exit(4)

    model = NNet()
    model.load_state_dict(torch.load(model_path, map_location='cpu'))
    model.eval()

    with torch.no_grad():
        pilim = Image.open(image_file).convert('L').convert('RGB')
        pilim = preprocess_pilim(pilim)
        input_array = prepare_for_input(pilim, flip_lr=False)
        lr_input_array = prepare_for_input(pilim, flip_lr=True)
        try:
            out_array = get_output(model(get_tensor(input_array)))
        except Exception:
            exit(2)
        lr_out_array = np.fliplr(get_output(model(get_tensor(lr_input_array))))
        # Average the prediction with its left-right flipped counterpart
        # (simple test-time augmentation), then binarize and scale to 8-bit.
        out_array = (out_array + lr_out_array) / 2
        out_array = threshold_output(out_array, 0.5)
        out_array *= 255
        out_array = np.array(out_array, dtype='uint8')
    return out_array
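# Hedged sketch: `prepare_for_input`, `get_tensor`, `get_output`, and
# `threshold_output` are project helpers that are not shown in this excerpt.
# The versions below are assumptions that only mirror how predict() calls them;
# the real normalization, channel layout, and activation may differ.
import numpy as np
import torch


def prepare_for_input(pilim, flip_lr=False):
    # PIL image -> float32 HxWxC array in [0, 1]; optionally mirrored for
    # test-time augmentation.
    arr = np.asarray(pilim, dtype=np.float32) / 255.0
    return np.fliplr(arr).copy() if flip_lr else arr


def get_tensor(input_array):
    # HxWxC -> 1xCxHxW torch tensor, the layout most PyTorch vision models expect.
    return torch.from_numpy(input_array).permute(2, 0, 1).unsqueeze(0)


def get_output(output_tensor):
    # 1xCxHxW logits -> HxWxC probability array on the CPU.
    return torch.sigmoid(output_tensor).squeeze(0).permute(1, 2, 0).cpu().numpy()


def threshold_output(out_array, threshold):
    # Binarize the averaged probabilities at the given threshold.
    return (out_array > threshold).astype(np.float32)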
def get_recursive_urls(parent_url, max_depth, max_urls):
    scraped_urls = [parent_url]
    domain = urlparse(parent_url).netloc
    page = MyPage()
    asyncio.get_event_loop().run_until_complete(get_page(page, parent_url))
    _recursive_get_urls(scraped_urls, page, max_urls, parent_url, domain,
                        depth=max_depth)
    eprint('[LOG] Finished crawling URLs for {}'.format(parent_url))
    return scraped_urls
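# Hedged sketch: `MyPage` and `get_page` are not defined in this excerpt. The
# crawler only needs the fetched HTML available as `page.source`, so a minimal
# aiohttp-based stand-in could look like this (assumed implementation; the real
# one may drive a headless browser instead):
import aiohttp


class MyPage:
    def __init__(self):
        self.source = ''


async def get_page(page, url):
    # Fetch the URL and stash the HTML on the page object for BeautifulSoup.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            page.source = await response.text()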
def get_data_test(data_folder, configurations, trained_model):
    # Parameters
    IMG_WIDTH = configurations.size_img
    IMG_HEIGHT = configurations.size_img
    IMG_CHANNELS = 3
    TEST_PATH = data_folder
    COUNT = configurations.sample_count

    # Path of Image Tiles and Masks
    path = os.path.join(TEST_PATH, "img")
    # path_mask = os.path.join(TEST_PATH, "mask")
    # total = int(sum([len(files) for r, d, files in os.walk(path)]))

    eprint(
        f'[DEBUG][get_data_test] Getting and Resizing ({IMG_WIDTH}x{IMG_HEIGHT}) Test Images and Masks... '
    )

    # Get and resize Test images and masks
    # test_cpt = int(sum([len(files) for r, d, files in os.walk(path)]))
    # X_test = np.ndarray((test_cpt, IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS), dtype=np.float32)
    # Y_test = np.ndarray((test_cpt, IMG_HEIGHT, IMG_WIDTH, 1), dtype=np.float32)  # dtype=np.bool)

    eprint(
        f'[DEBUG][get_data_test] Getting and Resizing Test Images and Masks Done!\nPath to img: {path}'
    )
    sys.stdout.flush()

    _, _, files_orj = next(os.walk(path))
    # _, _, files_mask = next(os.walk(path_mask))
    files_orj = sorted(files_orj)
    # files_mask = sorted(files_mask)

    eprint(f'[DEBUG][get_data_test] Number of Image Tiles: {len(files_orj)}')

    # Load Trained Model
    model = load_model(trained_model,
                       custom_objects={
                           'dice_coef': dice_coef,
                           'dice_coef_loss': dice_coef_loss
                       })

    # Output folder and prediction threshold
    output_pred = os.path.join(configurations.output_folder, 'Prediction')
    mkdir_if_not_exist(configurations.output_folder)
    mkdir_if_not_exist(output_pred)
    threshold_pred = 0.5

    for i, f in enumerate(files_orj):
        X_test = np.ndarray((1, IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS), dtype=np.float32)
        img = cv2.imread(os.path.join(path, f))
        img = cv2.resize(img, (IMG_HEIGHT, IMG_WIDTH), interpolation=cv2.INTER_AREA)
        img = img / 255
        X_test[0] = img

        # predictions = test_model(X_test, data_folder, trained_model, configurations)

        # Predict
        preds_test = model.predict(X_test)
        preds_reshaped = np.ndarray((1, IMG_HEIGHT, IMG_WIDTH), dtype=np.float32)
        preds_reshaped[0] = preds_test[0].reshape(IMG_HEIGHT, IMG_WIDTH)
        preds_upsampled = [
            np.expand_dims(cv2.resize(preds_reshaped[0], (IMG_HEIGHT, IMG_WIDTH)),
                           axis=-1)
        ]
        print("[INFO] Upsampling is done! (upsampled to ({}, {}) from ({}, {}))".format(
            IMG_HEIGHT, IMG_WIDTH, preds_test[0].shape[0], preds_test[0].shape[1]))

        # Save the raw (continuous) prediction and a thresholded binary mask
        img = preds_upsampled[0].copy()
        img_raw = img * 255
        out_name_raw = os.path.join(output_pred, "pred-raw-" + files_orj[i])
        cv2.imwrite(out_name_raw, img_raw)

        img[img > threshold_pred] = 1
        img[img <= threshold_pred] = 0
        img *= 255
        out_name = os.path.join(output_pred, "pred-" + files_orj[i])
        cv2.imwrite(out_name, img)

    print('[INFO] Finished Prediction!')
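# Hedged sketch: load_model() above needs `dice_coef` and `dice_coef_loss` as
# custom objects. A common Dice-coefficient definition is shown below; the
# project's actual smoothing constant and formulation may differ.
from keras import backend as K  # or tensorflow.keras, depending on the Keras distribution used


def dice_coef(y_true, y_pred, smooth=1.0):
    # Dice coefficient = 2*|A∩B| / (|A| + |B|), computed on flattened masks.
    y_true_f = K.flatten(y_true)
    y_pred_f = K.flatten(y_pred)
    intersection = K.sum(y_true_f * y_pred_f)
    return (2.0 * intersection + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + smooth)


def dice_coef_loss(y_true, y_pred):
    # Loss decreases as the overlap between prediction and ground truth increases.
    return 1.0 - dice_coef(y_true, y_pred)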
def get_data(configurations, data_folder):
    # Write Directory
    dir_write = os.path.join(
        configurations.dir_write,
        'Run_Train_' + configurations.model_name + '_' + str(current_time))
    dir_pred = os.path.join(dir_write, 'Pred_imgs')
    dir_model = os.path.join(dir_write, 'Model')
    dir_log = os.path.join(dir_write, 'Log')
    if not os.path.exists(dir_write):
        os.makedirs(dir_write)
        os.makedirs(dir_pred)
        os.makedirs(dir_model)
        os.makedirs(dir_log)

    IMG_WIDTH = configurations.size_img
    IMG_HEIGHT = configurations.size_img
    IMG_CHANNELS = 3
    TRAIN_PATH = data_folder

    # Path of Image Tiles and Masks
    print(data_folder)
    path = os.path.join(TRAIN_PATH, "img")
    path_mask = os.path.join(TRAIN_PATH, "mask")
    path_bud_info = os.path.join(TRAIN_PATH, "Bud_Info")

    eprint(
        f'[DEBUG][get_data] Getting and Resizing ({IMG_WIDTH}x{IMG_HEIGHT}) Train Images and Masks... '
    )

    # Get and resize train images and masks
    train_cpt = int(sum([len(files) for r, d, files in os.walk(path)]))

    eprint(
        f'[DEBUG][get_data] Getting and Resizing Train Images and Masks Done!\nPath to img: {path}'
    )
    sys.stdout.flush()

    _, _, files_orj = next(os.walk(path))
    _, _, files_mask = next(os.walk(path_mask))
    files_orj = sorted(files_orj)
    files_mask = sorted(files_mask)

    eprint(
        f'[DEBUG][get_data] Number of Image Tiles: {len(files_orj)}\t Number of Image Masks: {len(files_mask)}\n'
    )

    # Optionally filter tiles by total bud count threshold
    train_cpt_filtered = len(files_orj)
    files_orj_filtered = files_orj
    files_mask_filtered = files_mask
    if int(configurations.thold_tbud) > 0:
        train_cpt_filtered = 0
        files_orj_filtered = []
        files_mask_filtered = []
        for i, f in enumerate(files_orj):
            # Apply Bud Threshold
            if filter_tbud_count(path_bud_info, f, int(configurations.thold_tbud)):
                train_cpt_filtered += 1
                files_orj_filtered.append(files_orj[i])
                files_mask_filtered.append(files_mask[i])

    X_train = np.ndarray(
        (train_cpt_filtered, IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS),
        dtype=np.float32)
    Y_train = np.ndarray((train_cpt_filtered, IMG_HEIGHT, IMG_WIDTH, 1),
                         dtype=np.float32)  # dtype=np.bool)

    for i, f in enumerate(files_orj_filtered):
        img = cv2.imread(os.path.join(path, f))
        img = cv2.resize(img, (IMG_HEIGHT, IMG_WIDTH), interpolation=cv2.INTER_AREA)
        img = img / 255
        X_train[i] = img

    for i, fm in enumerate(files_mask_filtered):
        img_mask = cv2.imread(os.path.join(path_mask, fm), cv2.IMREAD_GRAYSCALE)
        img_mask = cv2.resize(img_mask, (IMG_HEIGHT, IMG_WIDTH), interpolation=cv2.INTER_AREA)
        img_mask = img_mask / 255
        img_mask = np.expand_dims(img_mask, axis=-1)
        Y_train[i] = img_mask

    eprint(
        f'[DEBUG][get_data] After Filter thold_tbud:{configurations.thold_tbud} Number of Image Tiles: {len(X_train)}\t Number of Image Masks: {len(Y_train)}\n'
    )
    eprint(
        f"[DEBUG][INFO] Data Matrix: {round(X_train.nbytes / (1024 * 1000.0), 3)} MB\n"
    )

    # Flatten the masks into per-pixel label vectors; weights_train is kept for
    # optional sample weighting during training.
    pixels = Y_train.flatten().reshape(train_cpt_filtered, IMG_HEIGHT * IMG_WIDTH)
    weights_train = pixels.copy()
    pixels = np.expand_dims(pixels, axis=-1)

    eprint("Data Read is Done!")
    return X_train, pixels
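# Hedged sketch: `filter_tbud_count` is referenced above but not defined in this
# excerpt. Assuming each tile has a same-named text file under Bud_Info whose
# first line holds the bud count for that tile, a stand-in could look like this
# (an assumption about the Bud_Info format, not the project's actual reader):
def filter_tbud_count(path_bud_info, tile_name, thold_tbud):
    # Keep the tile only if its recorded bud count meets the threshold.
    info_file = os.path.join(path_bud_info, os.path.splitext(tile_name)[0] + '.txt')
    if not os.path.exists(info_file):
        return False
    with open(info_file) as fh:
        try:
            count = int(float(fh.readline().strip()))
        except ValueError:
            return False
    return count >= thold_tbud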
def train_model(X, y, configurations):
    # Parameters - IMG
    IMG_HEIGHT = int(configurations.size_img)
    IMG_WIDTH = int(configurations.size_img)
    IMG_CHANNELS = 3

    # Parameters - Model
    lr_rate = float(configurations.learning_rate)
    model_name = str(configurations.model_name)
    model_type = str(configurations.model_type)
    dir_write = mkdir_if_not_exist(str(configurations.dir_write))
    activation = str(configurations.activation)
    batch_size = int(configurations.batch_size)
    epochs = int(configurations.epoch)
    dropout_ratio = float(configurations.dropout_ratio)
    dropout_level = int(configurations.dropout_level)
    model_string = str(configurations.model_string)
    eprint(f"[INFO][train_model] {model_string}")

    # Free up RAM in case the model definition cells were run multiple times
    K.clear_session()

    # Stop training when a monitored quantity has stopped improving
    # earlystopper = EarlyStopping(monitor='val_loss', patience=100, verbose=1)

    # Initialize the model
    if model_type.lower() == 'resunet':
        model = unetModel_residual(IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS,
                                   dropout_ratio=dropout_ratio,
                                   lr_rate=lr_rate, activation=activation,
                                   dropout_level=dropout_level)
    else:
        model = unetModel_basic_4(IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS,
                                  dropout_ratio=dropout_ratio,
                                  lr_rate=lr_rate, activation=activation,
                                  dropout_level=dropout_level)

    # Save the best model after every epoch
    checkpointer = ModelCheckpoint(
        dir_write + "/" + model_string + '_main_modelCheckpoint.h5',
        verbose=0, monitor='val_loss', save_best_only=True,
        save_weights_only=False, period=1, mode='auto')

    # Log training
    csv_logger = CSVLogger('{}/log_{}.training.csv'.format(dir_write, model_string))

    # Reduce lr_rate on plateau
    reduce_lr = ReduceLROnPlateau(monitor='val_dice_coef', factor=0.5,
                                  patience=10, verbose=0, mode='max',
                                  cooldown=1, min_lr=0.000001)

    # Early stopping with patience
    earlystopping = EarlyStopping(monitor='val_dice_coef', patience=25, mode='max')

    # Fit model
    eprint("[INFO][train_model] Model Fit...")
    results = model.fit(
        X, y,
        validation_split=0.2,
        batch_size=batch_size,
        epochs=epochs,
        callbacks=[checkpointer, csv_logger, reduce_lr, earlystopping],
        verbose=1,
        shuffle=True)  # , sample_weight=weights_train)
    eprint("[INFO][train_model] Model Fit Done!")

    # Write model history to the file
    pd.DataFrame(results.history).to_csv(
        os.path.join(dir_write, "history_" + model_string + ".csv"))

    return model, results
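# Hedged usage sketch for the training pipeline above. The entry-point name and
# 'configuration_train.yml' are assumptions; only get_data/train_model are taken
# from this file.
def train_main(data_folder):
    configurations = Configurations('configuration_train.yml')  # assumed config file name
    X_train, y_train = get_data(configurations, data_folder)
    model, results = train_model(X_train, y_train, configurations)
    return model, results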
def work(baseline_dir, updated_dir, prefix):
    baseline_dir = os.path.join("./tmp", baseline_dir)
    updated_dir = os.path.join("./tmp", updated_dir)
    images = sorted([
        image for image in os.listdir(baseline_dir) if image.endswith('.png')
    ])
    scores_dict = {}
    for i, image in enumerate(images):
        mask_matches = []
        baseline_image_path = os.path.join(baseline_dir, image)
        updated_image_path = os.path.join(updated_dir, image)
        baseline_image = load_image_helper(baseline_image_path)
        updated_image = load_image_helper(updated_image_path)

        eprint('[LOG] Making prediction for baseline - {}, image - {}'.format(prefix, image))
        baseline_image_mask = predict(baseline_image_path)
        eprint('[LOG] Saving masks for baseline - {}'.format(prefix))
        save_masks(baseline_dir, image, baseline_image_mask, baseline_image)

        eprint('[LOG] Making prediction for updated - {}, image - {}'.format(prefix, image))
        updated_image_mask = predict(updated_image_path)
        eprint('[LOG] Saving masks for updated - {}'.format(prefix))
        save_masks(updated_dir, image, updated_image_mask, updated_image)
        eprint('[LOG] Finished predictions')

        if baseline_image.shape != updated_image.shape:
            eprint('[LOG] Images have different shapes. Using DP algo')
            for c in range(0, 5):
                mask_matches.append(
                    match_images(baseline_image_mask[:, :, c],
                                 updated_image_mask[:, :, c], STEP))

        eprint('[LOG] Calculating mask divergence score for {}, image - {}'.format(prefix, image))
        mask_divergence_scores = Scores.diff_mask_divergence(
            baseline_image_mask // 255, updated_image_mask // 255, mask_matches)

        eprint('[LOG] Calculating pixelwise divergence score for {}, image - {}'.format(prefix, image))
        pixelwise_divergence_scores = Scores.diff_pixelwise_divergence(
            baseline_image, updated_image, baseline_image_mask // 255,
            updated_image_mask // 255, mask_matches)

        baseline_js_log_file = os.path.join(
            baseline_dir, image.split('.')[0] + "_js_log.json")
        updated_js_log_file = os.path.join(
            updated_dir, image.split('.')[0] + "_js_log.json")
        baseline_network_log_file = os.path.join(
            baseline_dir, image.split('.')[0] + "_network_log.json")
        updated_network_log_file = os.path.join(
            updated_dir, image.split('.')[0] + "_network_log.json")
        log_processor = LogProcessor(baseline_js_log_file, updated_js_log_file,
                                     baseline_network_log_file,
                                     updated_network_log_file)
        result = log_processor.run()

        ui_risk_score = max(mask_divergence_scores['overall'],
                            pixelwise_divergence_scores['overall'])
        scores_dict[i + 1] = {
            'ui_stats': {
                'mask_div': mask_divergence_scores,
                'pixelwise_div': pixelwise_divergence_scores,
                'risk_score': ui_risk_score
            },
            'js_stats': result['javascript'],
            'network_stats': result['network'],
            'risk_score': result['risk_score']
        }

    with open(os.path.join('./tmp', prefix + '_scores.json'), 'w') as f:
        json.dump(scores_dict, f, indent=2)
    eprint('[LOG] Saved scores dictionary for {}'.format(prefix))
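# Hedged sketch: `load_image_helper` is assumed to simply read a screenshot from
# disk as a NumPy array (cv2's default BGR layout); the project's real helper
# may also crop, pad, or convert color spaces.
import cv2


def load_image_helper(image_path):
    # Read the screenshot and fail loudly if the file is missing or unreadable.
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError('Could not read image: {}'.format(image_path))
    return image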
def work(baseline_url, updated_url, max_depth, max_urls, prefix,
         auth_baseline_username, auth_baseline_password,
         auth_updated_username, auth_updated_password):
    baseline_url = add_auth(url=baseline_url,
                            username=auth_baseline_username,
                            password=auth_baseline_password)
    updated_url = add_auth(url=updated_url,
                           username=auth_updated_username,
                           password=auth_updated_password)

    crawled_baseline = get_recursive_urls(baseline_url, max_depth, max_urls)[:max_urls]
    crawled_upgraded = get_recursive_urls(updated_url, max_depth, max_urls)[:max_urls]

    baseline_domain = get_domain(baseline_url)
    updated_domain = get_domain(updated_url)
    crawled_baseline_paths = [get_path(path) for path in crawled_baseline]
    crawled_updated_paths = [get_path(path) for path in crawled_upgraded]
    all_paths = list(set(crawled_baseline_paths) | set(crawled_updated_paths))

    ss_report = {}
    for i, path in enumerate(all_paths):
        eprint('[LOG] Taking screenshots for {} - {}'.format(prefix, path))
        collect_data(baseline_domain + path, prefix + '_baseline', '{}.png'.format(i + 1))
        collect_data(updated_domain + path, prefix + '_updated', '{}.png'.format(i + 1))
        ss_report[i + 1] = {
            'baseline': baseline_domain + path,
            'updated': updated_domain + path,
            'endpoint': path,
            'baseline_assets': 'tmp/' + prefix + "_baseline/",
            'updated_assets': 'tmp/' + prefix + "_updated/"
        }
    eprint('[LOG] Finished taking screenshots for {}'.format(prefix))

    with open(os.path.join('./tmp', prefix + '_ss_report.json'), 'w') as f:
        json.dump(ss_report, f, indent=2)

    p = Popen([
        'python3', 'worker_predict.py', '--baseline-dir', prefix + '_baseline',
        '--updated-dir', prefix + '_updated', '--prefix', prefix
    ])
    if p.poll() is not None and p.poll() > 0:
        eprint('[ERR] Failed to launch inference process')
        exit(3)
    eprint('[LOG] Waiting for {}'.format(prefix))
    p.wait()
    if p.poll() != 0:
        eprint('[ERR] Prediction script failed for {}'.format(prefix))
        exit(p.poll())
    eprint('[LOG] Finished prediction for {}'.format(prefix))

    ui_risk_scores = []
    network_risk_scores = []
    js_stats_total = []
    net_stats_total = []
    pixelwise_div_total = []
    mask_div_total = []

    with open(os.path.join('./tmp', prefix + '_report.json'), 'w') as f:
        scores_report = json.load(
            open(os.path.join('./tmp', prefix + '_scores.json')))
        screenshots_report = json.load(
            open(os.path.join('./tmp', prefix + '_ss_report.json')))
        page_report = {}
        for i in range(1, len(all_paths) + 1):
            page_report[i] = scores_report[str(i)]
            js_stats_total.append(scores_report[str(i)]["js_stats"])
            net_stats_total.append(scores_report[str(i)]["network_stats"])
            page_report[i]['links'] = screenshots_report[str(i)]
            ui_risk_scores.append(page_report[i]["ui_stats"]["risk_score"])
            network_risk_scores.append(page_report[i]["risk_score"])
            pixelwise_div_total.append(page_report[i]['ui_stats']['pixelwise_div'])
            mask_div_total.append(page_report[i]['ui_stats']['mask_div'])

        page_report['risk_score'] = max(max(ui_risk_scores), max(network_risk_scores))
        page_report['js_stats'] = dsum(js_stats_total)
        page_report['ui_stats'] = {
            'pixelwise_div_mean': dsum(pixelwise_div_total, True),
            'mask_div_mean': dsum(mask_div_total, True),
            'pixelwise_div_std': dstd(pixelwise_div_total),
            'mask_div_std': dstd(mask_div_total)
        }
        page_report['network_stats'] = dsum(net_stats_total)
        json.dump(page_report, f, indent=4)

    eprint('[LOG] Saved {} report to {}'.format(prefix, prefix + '_report.json'))
    os.remove(os.path.join('./tmp', prefix + '_scores.json'))
    os.remove(os.path.join('./tmp', prefix + '_ss_report.json'))
    exit(0)
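# Hedged sketch: `dsum` and `dstd` aggregate the per-page stat dictionaries into
# the final report. The implementations below are assumptions that merely match
# the calls above (key-wise sum or mean, and key-wise standard deviation over
# flat dicts of numbers); the project's versions may handle nesting differently.
import numpy as np


def dsum(dicts, mean=False):
    # Combine a list of flat dicts key by key; return per-key means when mean=True.
    keys = set().union(*dicts) if dicts else set()
    out = {}
    for key in keys:
        values = [d[key] for d in dicts if key in d]
        out[key] = float(np.mean(values)) if mean else float(np.sum(values))
    return out


def dstd(dicts):
    # Key-wise standard deviation across the per-page dictionaries.
    keys = set().union(*dicts) if dicts else set()
    return {key: float(np.std([d[key] for d in dicts if key in d])) for key in keys}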