def get_clive_files():
    """Return the ready-data directory entries whose stem contains "cline".

    The entries are returned in sorted order.
    """
    entries = sorted(get_ready_data_dir().iterdir())
    return [entry for entry in entries if "cline" in entry.stem]
def process_part(iCluster):
    """Sample every video set of cluster *iCluster* and store it on disk.

    For each metadata entry of the cluster two files are written to the
    ready-data directory:

    * ``c_{iCluster}_{stem}.pkl`` - the DataFrame from ``sample_video_set``
    * ``c_{iCluster}_{stem}.npy`` - the preprocessed ``data`` column

    Entries for which both files already exist are skipped.

    The rows are shuffled once, *before* either file is written, so that row
    ``k`` of the array still belongs to row ``k`` of the DataFrame.  Shuffling
    only the array after the DataFrame had been saved broke that pairing for
    any code that selects array rows with a mask built from the DataFrame.
    """
    print(f"process_part {iCluster} starting...")

    output_dir = get_ready_data_dir()
    assert output_dir.is_dir()

    metadata = VideoManager().get_cluster_metadata(iCluster)
    mtcnn_detector = MTCNNDetector()

    for entry in metadata:
        orig_path = entry[0]

        file_base = output_dir / f"c_{iCluster}_{orig_path.stem}"
        filename_df = file_base.with_suffix(".pkl")
        filename_np = file_base.with_suffix(".npy")

        # Both outputs present: this video set was handled by an earlier run.
        if filename_df.is_file() and filename_np.is_file():
            continue

        print(str(orig_path))
        df = sample_video_set(mtcnn_detector, entry)

        # One permutation applied to the frame keeps .pkl and .npy row-aligned.
        order = np.random.permutation(len(df))
        df = df.iloc[order].reset_index(drop=True)

        print(f"Saving {str(file_base)}...")
        df.to_pickle(filename_df)

        print(f"Preprocessing {str(file_base)}...")
        # NOTE(review): assumes preprocess_input returns one output row per
        # input row, in the same order - confirm.
        data = preprocess_input(np.stack(df.data.values))
        np.save(filename_np, data)

        print(f"Videoset {str(file_base)} done.")

    print(f"Cluster {iCluster} done.")
def example():
    """Call ``align_frames`` on the ready-data directory with fixed settings.

    Arguments passed, in order: the directory, a border of 30, the input
    file name "test", the output file name "cut_test" and 128 images.
    """
    align_frames(get_ready_data_dir(), 30, "test", "cut_test", 128)
def create_chunks():
    """Bundle per-file arrays into ``photos_NNNN.npy`` / ``labels_NNNN.npy``.

    Every entry of the output directory whose suffix contains "npy" is loaded
    (in sorted order) and reshaped to ``(-1, 32, 256, 3)``.  Its first seven
    samples get label ``0.0``, the next seven label ``1.0``, and the two
    groups are interleaved.  Whenever more than 5000 samples are buffered, and
    again after the last file, the buffer is divided by 255 and written to the
    ready-data directory together with its labels.
    """
    source_dir = get_output_dir()
    ready_dir = get_ready_data_dir()
    npy_paths = [p for p in sorted(source_dir.iterdir()) if "npy" in p.suffix]

    chunk_no = 0
    photos, labels = [], []

    for path in npy_paths:
        samples = np.load(path).reshape(-1, 32, 256, 3)
        reals = samples[:7]
        fakes = samples[7:14]  # first fake only

        for k in range(7):
            photos.extend((reals[k], fakes[k]))
            labels.extend((0.0, 1.0))

        buffer_full = len(photos) > 5000
        if not (path == npy_paths[-1] or buffer_full):
            continue

        photo_array = np.asarray(photos) / 255.0
        label_array = np.asarray(labels)
        np.save(ready_dir / f"photos_{chunk_no:04}.npy", photo_array)
        np.save(ready_dir / f"labels_{chunk_no:04}.npy", label_array)

        chunk_no += 1
        photos, labels = [], []
def predict_lineset(sample_model, sample_lineset):
    """Score one line set with the real/fake model pair of one model sample.

    Args:
        sample_model: mapping with keys ``'name'`` and ``'cluster'``
            identifying the model pair to load.
        sample_lineset: mapping with keys ``'name'`` and ``'cluster'``
            identifying the ``c_{cluster}_{name}.pkl`` / ``.npy`` data files
            in the ready-data directory.

    Returns:
        A 4-tuple ``(err_mr_lr, err_mf_lr, err_mr_lf, err_mf_lf)``: the result
        of ``predict`` for (model real, lines real), (model fake, lines real),
        (model real, lines fake) and (model fake, lines fake).

    Raises:
        AssertionError: if either data file of the line set is missing.
    """
    model_name = sample_model['name']
    model_cluster = sample_model['cluster']
    model_real, model_fake = load_model_pair(model_cluster, model_name)

    lineset_name = sample_lineset['name']
    lineset_cluster = sample_lineset['cluster']

    # The data files belong to the line set.  They used to be built from the
    # model's cluster/name, which scored every model on its own data and
    # ignored ``sample_lineset`` altogether.
    ready_dir = get_ready_data_dir()
    df_file = ready_dir / f"c_{lineset_cluster}_{lineset_name}.pkl"
    assert df_file.is_file()
    npy_file = ready_dir / f"c_{lineset_cluster}_{lineset_name}.npy"
    assert npy_file.is_file()

    # NOTE(review): the pickle is assumed to be produced by this project;
    # never point this at untrusted files.
    df = pd.read_pickle(df_file)
    data = np.load(npy_file)

    m_fake = (df.fake == True)  # noqa: E712 - element-wise comparison
    m_real = (df.fake == False)  # noqa: E712

    # Only the first NUM_CUT lines of each class are scored.
    NUM_CUT = 1000
    lines_real = data[m_real][:NUM_CUT]
    lines_fake = data[m_fake][:NUM_CUT]

    err_mr_lr = predict(model_real, lines_real)
    err_mf_lr = predict(model_fake, lines_real)
    err_mr_lf = predict(model_real, lines_fake)
    err_mf_lf = predict(model_fake, lines_fake)

    return err_mr_lr, err_mf_lr, err_mr_lf, err_mf_lf
def load_to_series_grayscale(l_x):
    """Load images as inverted grayscale arrays.

    Each name in *l_x* is resolved against the ready-data directory, loaded
    with ``color_mode="grayscale"`` and mapped to ``1.0 - pixel / 255.0``.

    Returns:
        ``(sValid, sData)``: a Series holding ``True`` for every image and a
        Series holding the arrays, both in input order.

    Raises:
        AssertionError: if a name does not refer to an existing file.
    """
    input_dir = get_ready_data_dir()

    images = []
    for name in l_x:
        path = input_dir / name
        assert path.is_file()
        pixels = np.array(load_img(path, color_mode="grayscale"))
        images.append(1.0 - (pixels / 255.0))

    flags = [True] * len(images)
    return (pd.Series(flags), pd.Series(images))
def line_pairs():
    """Build a name/cluster table from the file names of the ready-data directory.

    For every entry the first two characters of its file name are removed and
    the rest is split on "_": the first token is the cluster, the second token
    without its last four characters is the name.  Names that occur only once
    are dropped (their count is printed); of the remaining rows only the first
    one per name is kept.

    Returns:
        DataFrame with the columns ``name`` and ``cluster``.
    """
    line_dir = get_ready_data_dir()
    assert line_dir.is_dir()

    names, clusters = [], []
    for entry in sorted(line_dir.iterdir()):
        tokens = entry.name[2:].split("_")
        clusters.append(tokens[0])
        names.append(tokens[1][:-4])

    table = pd.DataFrame({'name': names, 'cluster': clusters})

    files_per_name = table.groupby('name').size()
    is_single = table['name'].map(files_per_name) == 1
    print (f"Dropping singles {is_single.sum()}")
    table = table[~is_single].reset_index(drop=True)

    is_repeat = table.duplicated(subset='name')
    return table[~is_repeat].reset_index(drop=True)
def process_videoset(iCluster, original):
    """Train and save the fake and real models for one video set.

    Reads ``c_{iCluster}_{original}.pkl`` / ``.npy`` from the ready-data
    directory and writes ``c_{iCluster}_{original}_real.h5`` /
    ``_fake.h5`` to the model directory.  Returns without doing anything when
    an input file is missing or when both model files already exist.
    """
    input_dir = get_ready_data_dir()
    output_dir = get_model_dir()
    tag = f"c_{iCluster}_{original}"

    input_df = input_dir / f"{tag}.pkl"
    input_npy = input_dir / f"{tag}.npy"
    if not (input_df.is_file() and input_npy.is_file()):
        # Missing input: nothing to train on.
        return

    output_model_real = output_dir / f"{tag}_real.h5"
    output_model_fake = output_dir / f"{tag}_fake.h5"
    if output_model_real.is_file() and output_model_fake.is_file():
        print(f"{iCluster}_{original} already created")
        return

    print(f"Processing c_{iCluster}_{original}...")

    frame = pd.read_pickle(input_df)
    lines = np.load(input_npy)

    fake_rows = (frame.fake == True)
    real_rows = (frame.fake == False)

    mse_fake, model_fake = train_model(lines[fake_rows])
    mse_real, model_real = train_model(lines[real_rows])

    print(f"c_{iCluster}_{original}: mse_fake {mse_fake} mse_real {mse_real}")

    model_fake.save(output_model_fake)
    model_real.save(output_model_real)
def load_to_series_rgb(l_x):
    """Load RGB images and standardise every colour channel separately.

    Each name in *l_x* is resolved against the ready-data directory and loaded
    with ``color_mode="rgb"``.  Channel ``c`` of the result is
    ``(pixel[c] - mean[c]) / scale[c]`` stored as ``float32``.

    The green and blue outputs used to be computed from the red channel
    (index 0) as well; each channel is now derived from itself.

    Returns:
        ``(sValid, sData)``: a Series holding ``True`` for every image and a
        Series holding the normalised arrays, both in input order.

    Raises:
        AssertionError: if a name does not refer to an existing file.
    """
    # Indexed 0 - red, 1 - green, 2 - blue.
    # NOTE(review): the values look like ImageNet channel statistics - confirm.
    channel_mean = np.array([123.68, 116.779, 103.939])
    channel_scale = np.array([58.393, 57.12, 57.375])

    input_dir = get_ready_data_dir()

    l_data = []
    for x in l_x:
        filename = input_dir / x
        assert filename.is_file()

        data = np.array(load_img(filename, color_mode="rgb"))
        # Broadcasting over the last axis applies mean/scale per channel.
        l_data.append(((data - channel_mean) / channel_scale).astype(np.float32))

    sValid = pd.Series([True] * len(l_data))
    sData = pd.Series(l_data)
    return (sValid, sData)
def create_test_video_chunks(iPartMin, iPartMax):
    """Regroup merged test arrays into one ``te_*`` file per video.

    Files of the ready-data directory are selected when their stem has seven
    "_"-separated tokens, starts with ``test``, does not have ``meta`` as the
    second token, covers a part range inside ``[iPartMin, iPartMax]`` (tokens
    4 and 6) and has an existing meta file for that range.  The selected
    arrays and their meta tables are concatenated; rows are then grouped by
    ``"{iPart}_{video}"`` and each group is saved as
    ``te_{iPartMin}_{iPartMax}_{index:04}_{y}`` where ``y`` is taken from the
    first row of the group.
    """
    assert iPartMax > iPartMin
    data_dir = get_ready_data_dir()

    pairs = []
    for path in list(data_dir.iterdir()):
        tokens = str(path.stem).split("_")
        if len(tokens) != 7 or tokens[0] != 'test' or tokens[1] == 'meta':
            continue

        part_lo = int(tokens[4])
        part_hi = int(tokens[6])
        assert part_hi > part_lo
        if part_lo < iPartMin or part_hi > iPartMax:
            continue

        meta_path = _get_meta_file(part_lo, part_hi)
        if not meta_path.is_file():
            continue

        pairs.append((path, meta_path))

    arrays, metas = [], []
    for array_path, meta_path in pairs:
        rows = np.load(array_path)
        meta = pd.read_pickle(meta_path)
        assert rows.shape[0] == meta.shape[0]
        arrays.append(rows)
        metas.append(meta)

    all_rows = np.concatenate(arrays)
    all_meta = pd.concat(metas, ignore_index=True)

    video_key = all_meta.iPart.astype('str') + "_" + all_meta.video

    for ix, key in enumerate(np.unique(video_key)):
        mask = video_key == key
        video_rows = all_rows[mask]
        label = all_meta[mask].y.iloc[0]
        np.save(data_dir / f"te_{iPartMin}_{iPartMax}_{ix:04}_{label}", video_rows)
l_eye_c = np.array(face['l_eye']) bb_min = l_eye_c - eye_size bb_max = l_eye_c + eye_size arScale = np.array([x_max, y_max]) bb_min = (bb_min * arScale).astype(np.int32) bb_max = (bb_max * arScale).astype(np.int32) outputsize = 128 + 64 im_mask, im_test = cut_frame(bb_min, bb_max, video_real, video_fake, iFrame, outputsize, False) im_test.save(get_ready_data_dir() / f"test_{iFrame:003}.png") im_mask.save(get_ready_data_dir() / f"test_{iFrame:003}_m.png") # run ImageAligner l_data = [] for iFrame in range(127): im_test0 = Image.open(get_ready_data_dir() / f"cut_test_{iFrame:003}.png") im_test1 = Image.open(get_ready_data_dir() / f"cut_test_{iFrame + 1:003}.png") array0 = np.asarray(im_test0) array1 = np.asarray(im_test1)
def create_train_chunks(iPartMin, iPartMax, nGBInternal):
    """Shuffle merged training arrays and rewrite them as ``tr_*`` chunks.

    Files of the ready-data directory are selected when their stem has seven
    "_"-separated tokens, starts with ``train`` and covers a part range inside
    ``[iPartMin, iPartMax]`` (tokens 4 and 6).  They are visited in random
    order and buffered; once the buffered row count exceeds the limit derived
    from *nGBInternal* (``nGBInternal`` GiB divided by ``64 * 3 * 4`` bytes),
    or after the last file, the buffer is concatenated, shuffled, split and
    saved as ``tr_{iPartMin}_{iPartMax}_{NNNN}.npy``.

    Raises:
        AssertionError: if ``iPartMax <= iPartMin``, ``nGBInternal <= 5`` or a
            single file has more rows than the internal limit.
    """
    assert iPartMax > iPartMin
    assert nGBInternal > 5

    data_dir = get_ready_data_dir()

    train_files = []
    for path in list(data_dir.iterdir()):
        tokens = str(path.stem).split("_")
        if len(tokens) != 7 or tokens[0] != 'train':
            continue

        part_lo = int(tokens[4])
        part_hi = int(tokens[6])
        assert part_hi > part_lo

        if part_lo >= iPartMin and part_hi <= iPartMax:
            train_files.append(path)

    shuffle(train_files)

    bytes_per_row = 64 * 3 * 4
    buffer_bytes = nGBInternal * 1024 * 1024 * 1024
    max_internal_rows = int(buffer_bytes / bytes_per_row)
    max_out_rows = 1000000

    pending = []
    pending_rows = 0
    next_file_no = 0
    last_idx = len(train_files) - 1

    for idx, path in enumerate(train_files):
        print(f"loading {path}...")
        loaded = np.load(path)
        assert loaded.shape[0] <= max_internal_rows, "single file exceeds internal buffer size"

        pending_rows += loaded.shape[0]
        pending.append(loaded.copy())

        # Keep buffering until the limit is exceeded or the input is exhausted.
        if idx != last_idx and pending_rows <= max_internal_rows:
            continue

        print(f"Writing out. {pending_rows} > {max_internal_rows} or last file")

        merged = np.concatenate(pending)
        np.random.shuffle(merged)

        num_rows_out = merged.shape[0]
        num_chunks = int(1 + num_rows_out / max_out_rows)
        print(f" Writing out. {num_rows_out} lines in {num_chunks} chunks")

        for piece in np.array_split(merged, num_chunks):
            target = data_dir / f"tr_{iPartMin}_{iPartMax}_{next_file_no:04}.npy"
            np.save(target, piece)
            print(f" saved chunk with {piece.shape[0]} lines")
            next_file_no += 1

        pending = []
        pending_rows = 0
def _get_meta_file(iMin, iMax):
    """Return the path ``test_meta_p_{iMin}_p_{iMax}.pkl`` inside the ready-data directory."""
    return get_ready_data_dir() / f"test_meta_p_{iMin}_p_{iMax}.pkl"
def create_test_merge(iPartMin, iPartMax):
    """Merge per-video test arrays into one array per feature plus a meta table.

    ``.npy`` files of the output directory are used when their stem has six
    "_"-separated tokens, the second token is ``Test`` and the part number
    (token 3) lies in ``range(iPartMin, iPartMax)``.  Files flagged by
    ``is_error_line`` are skipped.  Column 0 of a file holds the feature id;
    the remaining columns are reshaped to ``(-1, 32, 3)`` and distributed to
    the features of ``get_feature_converter()``.

    Written to the ready-data directory:
        * ``test_{feature}_p_{iPartMin}_p_{iPartMax}.npy`` for every feature
          that received data (otherwise a "No data" line is printed)
        * ``test_meta_p_{iPartMin}_p_{iPartMax}.pkl`` with the columns
          ``iPart``, ``video`` and ``y`` (tokens 3, 4 and 5 of the stem)

    Raises:
        AssertionError: on an invalid part range, a missing directory, or
            when the rows of a file do not divide evenly among the features.
    """
    assert iPartMax > iPartMin
    wanted_parts = range(iPartMin, iPartMax)
    num_length = 32

    input_dir = get_output_dir()
    assert input_dir.is_dir()
    output_dir = get_ready_data_dir()
    assert output_dir.is_dir()

    d_f = get_feature_converter()
    num_features = len(d_f.keys())
    per_feature = {zFeature: [] for zFeature in d_f.keys()}
    meta = {'iPart': [], 'video': [], 'y': []}

    npy_files = [p for p in list(input_dir.iterdir()) if p.suffix == '.npy']

    for path in npy_files:
        tokens = str(path.stem).split("_")
        if len(tokens) != 6 or tokens[1] != 'Test':
            continue

        iPart = int(tokens[3])
        if iPart not in wanted_parts:
            continue
        video, y = tokens[4], tokens[5]

        raw = np.load(path)
        if is_error_line(raw):
            continue

        feature_ids = raw[:, 0]
        lines = raw[:, 1:].reshape(-1, num_length, 3)

        num_rows = lines.shape[0]
        assert num_rows % num_features == 0
        rows_each = num_rows // num_features

        # One meta row per line of a single feature.
        meta['iPart'].extend([iPart] * rows_each)
        meta['video'].extend([video] * rows_each)
        meta['y'].extend([y] * rows_each)

        for zFeature, iF in d_f.items():
            selected = lines[feature_ids == iF]
            per_feature[zFeature].append(selected)
            assert selected.shape[0] == rows_each

    num_meta = len(meta['iPart'])

    for zFeature in d_f.keys():
        if not per_feature[zFeature]:
            print(f"No data: test_{zFeature}_p_{iPartMin}_p_{iPartMax}")
            continue
        merged = np.concatenate(per_feature[zFeature])
        assert merged.shape[0] == num_meta
        np.save(output_dir / f"test_{zFeature}_p_{iPartMin}_p_{iPartMax}.npy", merged)

    df_meta = pd.DataFrame(meta)
    df_meta.to_pickle(output_dir / f"test_meta_p_{iPartMin}_p_{iPartMax}.pkl")
def create_train_merge(iPartMin, iPartMax):
    """Merge per-pair training arrays into one array per feature.

    ``.npy`` files of the output directory are used when their stem has six
    "_"-separated tokens, the second token is ``Pair`` and the part number
    (token 3) lies in ``range(iPartMin, iPartMax)``.  Files flagged by
    ``is_error_line`` are skipped.  Column 0 of a file holds the feature id;
    the remaining columns are reshaped to ``(-1, 64, 3)`` and distributed to
    the features of ``get_feature_converter()``.

    Every feature that received data is saved to the ready-data directory as
    ``train_{feature}_p_{iPartMin}_p_{iPartMax}.npy``.

    Raises:
        AssertionError: on an invalid part range or a missing directory.
    """
    assert iPartMax > iPartMin
    wanted_parts = range(iPartMin, iPartMax)
    num_length = 32

    input_dir = get_output_dir()
    assert input_dir.is_dir()
    output_dir = get_ready_data_dir()
    assert output_dir.is_dir()

    d_f = get_feature_converter()
    per_feature = {zFeature: [] for zFeature in d_f.keys()}

    npy_files = [p for p in list(input_dir.iterdir()) if p.suffix == '.npy']

    for path in npy_files:
        tokens = str(path.stem).split("_")
        if len(tokens) != 6 or tokens[1] != 'Pair':
            continue
        if int(tokens[3]) not in wanted_parts:
            continue

        raw = np.load(path)
        if is_error_line(raw):
            continue

        feature_ids = raw[:, 0]
        lines = raw[:, 1:].reshape(-1, num_length * 2, 3)

        for zFeature, iF in d_f.items():
            per_feature[zFeature].append(lines[feature_ids == iF])

    for zFeature, collected in per_feature.items():
        if not collected:
            continue
        target = output_dir / f"train_{zFeature}_p_{iPartMin}_p_{iPartMax}.npy"
        np.save(target, np.concatenate(collected))
def process_part(iCluster):
    """Cut ten random face crops from every video of cluster *iCluster*.

    For each metadata entry the original video (``entry[0]``) is read and
    compared with every video listed in ``entry[1]`` and finally with itself.
    For each comparison video that has the same shape as the original, ten
    frames are drawn at random and ``cut_frame`` produces a test, a real and
    a mask image, saved as ``{stem}_{NNN}_t.png``, ``_r.png`` and ``_m.png``
    in ``<ready-data>/c2_{iCluster}``.

    Videos that cannot be read are reported on stdout and skipped.

    Changes against the previous version: dead probes of hard-coded local
    file paths and unused locals were removed, and the metadata entry is no
    longer modified (the original path used to be appended to ``entry[1]``
    in place).
    """
    ready_dir = get_ready_data_dir()
    assert ready_dir.is_dir()

    output_dir = ready_dir / f"c2_{iCluster}"
    output_dir.mkdir(exist_ok=True)
    assert output_dir.is_dir()

    v = VideoManager.VideoManager()
    l_d = v.get_cluster_metadata(iCluster)

    mtcnn_detector = MTCNNDetector()

    for entry in l_d:
        orig_path = entry[0]
        print(str(orig_path))

        # Best effort: an unreadable video must not abort the whole cluster.
        try:
            orig_video = read_video(orig_path, 0)
        except Exception as err:
            print(err)
            continue

        z_max, y_max, x_max = orig_video.shape[:3]

        # New list, so the metadata owned by the video manager stays untouched.
        l_all = [*entry[1], orig_path]

        for test_path in l_all:
            print(" " + str(test_path))
            filename_base = f"{test_path.stem}"

            try:
                test_video = read_video(test_path, 0)
            except Exception as err:
                print(err)
                continue

            if test_video.shape[:3] != (z_max, y_max, x_max):
                print("Not identical formats")
                continue

            d_faces = find_spaced_out_faces_boxes(mtcnn_detector, test_video, 30)

            for iSample in range(10):
                # An int argument draws from arange(z_max).
                z_sample = np.random.choice(z_max)

                bb_min, bb_max = get_random_face_box_from_z(
                    d_faces, z_sample, x_max, y_max, z_max)

                im_mask, im_real, im_test = cut_frame(
                    bb_min, bb_max, orig_video, test_video, z_sample, -1, False)

                filename = filename_base + f"_{iSample:003}"
                im_test.save(output_dir / (filename + "_t.png"))
                im_real.save(output_dir / (filename + "_r.png"))
                im_mask.save(output_dir / (filename + "_m.png"))
def train_stage2(zFeature, zModel_type, iPartMin, iPartMax):
    """Train and save the second-stage LightGBM classifier for one feature.

    Inputs:
        * ``test_{zFeature}_p_{iPartMin}_p_{iPartMax}.npy`` and
          ``test_meta_p_{iPartMin}_p_{iPartMax}.pkl`` from the ready-data
          directory
        * ``pred_{zFeature}_p_{iPartMin}_p_{iPartMax}_{zModel_type}.npy`` from
          the prediction directory (``get_pred0_dir()``)

    The input lines and the predicted lines are grouped per video
    (``iPart`` + ``video``); ``get_accumulated_stats`` turns each group into
    one feature row labelled with the ``y`` of the group's first meta row.
    The first 90 % of the rows train a binary LightGBM model, the rest is the
    validation set.  The model is saved as
    ``m2_{zFeature}_p_{iPartMin}_p_{iPartMax}_{zModel_type}.txt``.

    Raises:
        AssertionError: if an input file is missing or the array shapes do
            not agree with each other or with the meta table.
    """
    input_dir = get_ready_data_dir()
    pred_dir = get_pred0_dir()

    data_file = input_dir / f"test_{zFeature}_p_{iPartMin}_p_{iPartMax}.npy"
    assert data_file.is_file()

    meta_file = input_dir / f"test_meta_p_{iPartMin}_p_{iPartMax}.pkl"
    assert meta_file.is_file()

    # Was built from an undefined ``output_dir`` while ``pred_dir`` sat unused.
    prediction_file = pred_dir / f"pred_{zFeature}_p_{iPartMin}_p_{iPartMax}_{zModel_type}.npy"
    assert prediction_file.is_file()

    # NOTE(review): ``model_dir`` is not defined inside this function; it has
    # to be a module-level name - confirm (other code uses get_model_dir()).
    m2_file = model_dir / f"m2_{zFeature}_p_{iPartMin}_p_{iPartMax}_{zModel_type}.txt"

    data = np.load(data_file)
    data = preprocess_input(data.reshape(-1, 32, 3))

    df_meta = pd.read_pickle(meta_file)
    data_p = np.load(prediction_file)

    assert data.shape[0] == df_meta.shape[0]
    assert data.shape == data_p.shape

    # Replace (iPart, video) by a dense integer id per distinct video.
    azVideo = np.array(df_meta['iPart'].astype('str') + df_meta['video'])
    aiVideo = np.searchsorted(np.unique(azVideo), azVideo)

    df_meta = df_meta.assign(id=aiVideo)
    df_meta = df_meta.drop(['iPart', 'video'], axis=1)

    stats_rows = []
    for video_id in np.unique(df_meta.id):
        m_id = df_meta.id == video_id

        d_acc = get_accumulated_stats(data[m_id], data_p[m_id])
        d_acc['y'] = df_meta[m_id].iloc[0].y
        stats_rows.append(d_acc)

    df = pd.DataFrame(stats_rows)

    num_train = int(0.9 * df.shape[0])

    x_cols = [x for x in list(df.columns) if x != 'y']
    X_train = df[x_cols][:num_train]
    X_test = df[x_cols][num_train:]

    y = df.y.copy()
    m_fake = (y == 'fake')
    m_real = (y == 'real')
    y[m_fake] = '1'
    y[m_real] = '0'
    # ``np.int`` was only an alias of the builtin and no longer exists in
    # current NumPy releases.
    y = y.astype(int)

    y_train = y[:num_train]
    y_test = y[num_train:]

    params = {
        'objective': 'binary',
        'learning_rate': 0.01,
        'num_leaves': 5,
        'feature_fraction': 0.64,
        'bagging_fraction': 0.8,
        'bagging_freq': 1,
        'boosting_type': 'gbdt',
        'metric': 'binary_logloss',
    }

    # making lgbm datasets for train and valid
    d_train = lgbm.Dataset(X_train, y_train)
    d_valid = lgbm.Dataset(X_test, y_test)

    # NOTE(review): newer LightGBM releases dropped ``verbose_eval`` in favour
    # of a logging callback - confirm against the installed version.
    m2 = lgbm.train(params,
                    d_train,
                    290,
                    valid_sets=[d_train, d_valid],
                    verbose_eval=1)

    _ = m2.save_model(str(m2_file))
model.add(Dense(64, activation='relu')) model.add(Dropout(0.1)) model.add(Dense(16, activation='relu')) model.add(Dropout(0.1)) model.add(Dense(1, activation='sigmoid')) model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) model.summary() return model ready_dir = get_ready_data_dir() model = get_model_dense() filepath_photo = ready_dir / f"photos_0001.npy" filepath_label = ready_dir / f"labels_0001.npy" anTest = np.load(filepath_photo) #anTest = anTest.reshape(-1, 32, 256 * 3) anYTest = np.load(filepath_label) for x in range(1): print(f"Processing {x:04}...") filepath_photo = ready_dir / f"photos_{x:04}.npy" filepath_label = ready_dir / f"labels_{x:04}.npy"