def main(video_source_directory_path, count, fps, width, height):
    path_list = Path(video_source_directory_path).glob("**/*.mp4")
    current_count = 0
    print("Generating pickle files according to the given config...")
    for path in path_list:
        if current_count >= count:
            break
        print("\n\n\nCOUNT: " + str(current_count), end="\n\n\n")
        video_source_file_name = str(path.stem) + ".mp4"
        print("Processing video file: " + video_source_file_name)
        print("Preprocessing...")
        start = time.time()
        frames = pp.get_frames(video_file_source_path=str(path), req_fps=fps, width=width, height=height)
        mid = time.time()
        print("Time required for preprocessing is: " + str(mid - start))
        print("Generating pickle dump...")
        ph.generate_pickle_list(video_name=str(path.stem), frames=frames)
        end = time.time()
        print("Time required for generating pickle file is: " + str(end - mid))
        print("Total time required for saving a video is: " + str(end - start))
        current_count += 1
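# A minimal sketch of how this entry point might be driven from the command
# line. The flag names and defaults below are assumptions for illustration,
# not part of the original module.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Generate pickle files from videos.")
    parser.add_argument("--source-dir", required=True, help="directory searched recursively for .mp4 files")
    parser.add_argument("--count", type=int, default=10, help="maximum number of videos to process")
    parser.add_argument("--fps", type=int, default=2, help="frames per second to sample")
    parser.add_argument("--width", type=int, default=480)
    parser.add_argument("--height", type=int, default=360)
    args = parser.parse_args()

    main(args.source_dir, args.count, args.fps, args.width, args.height)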
def test_sift_extract():
    video_source_file_path = "../dataset/videos/200_512kb.mp4"
    test_frames_file_path = "frames/"
    fps = 2
    width = 480
    height = 360
    frames = pp.get_frames(video_file_source_path=video_source_file_path, req_fps=fps, width=width, height=height)
    # Dump the SIFT descriptors of every frame to a text file for inspection.
    with open("d.txt", "a") as file:
        for f in frames:
            kp, d = lf.extract_sift_keypoints_and_descriptors(image=f, limit=100)
            # Frames with no detectable features yield no descriptors.
            if d is not None:
                file.write(str(d))
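# A possible implementation of lf.extract_sift_keypoints_and_descriptors,
# assuming it wraps OpenCV's SIFT detector. The local_features module is not
# shown in this section, so treat this as an illustrative sketch only.
import cv2

def extract_sift_keypoints_and_descriptors(image, limit=100):
    # nfeatures caps the number of keypoints retained (the strongest ones).
    sift = cv2.SIFT_create(nfeatures=limit)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    keypoints, descriptors = sift.detectAndCompute(gray, None)
    return keypoints, descriptors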
def main():
    video_source_file_path = "../dataset/videos/test.mp4"
    test_frames_file_path = "frames/"
    fps = 30
    width = 480
    height = 360
    frames = preprocessing.get_frames(video_file_source_path=video_source_file_path, req_fps=fps, width=width, height=height)
    # Write each extracted frame to disk as a numbered PNG.
    for c, frame in enumerate(frames):
        cv2.imwrite(test_frames_file_path + str(c) + ".png", frame)
def test_pickling():
    video_source_file_path = "../dataset/videos/200_512kb.mp4"
    test_frames_file_path = "frames/"
    fps = 30
    width = 480
    height = 360
    frames = pp.get_frames(video_file_source_path=video_source_file_path, req_fps=fps, width=width, height=height)
    key_points = lf.extract_sift_key_points(image=frames[20])
    descriptors = lf.extract_sift_descriptors(image=frames[20])
    # Round-trip the keypoints through pickle to check serializability.
    p = pickle.dumps(key_points)
    print(len(p))
    d_new = pickle.loads(p)
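# Note: in many OpenCV builds, cv2.KeyPoint objects cannot be pickled directly
# and pickle.dumps raises a TypeError. A common workaround, sketched below, is
# to convert the keypoints to plain tuples before pickling and rebuild them
# after loading; these helper names are illustrative, not part of this repo.
def keypoints_to_tuples(key_points):
    return [(kp.pt, kp.size, kp.angle, kp.response, kp.octave, kp.class_id)
            for kp in key_points]

def tuples_to_keypoints(tuples):
    return [cv2.KeyPoint(t[0][0], t[0][1], t[1], t[2], t[3], t[4], t[5])
            for t in tuples]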
def get_audio_image(tr_data):
    # Keep track of the number of videos we cannot process.
    num_skipped_videos = 0
    # Accumulators for the NMF bases (W matrices) and the ResNet labels; they
    # are initialized lazily on the first successfully processed sample.
    W_all = None
    labels_all = None
    for count in tqdm(range(len(tr_data))):
        sample = tr_data[count]
        url = 'https://www.youtube.com/watch?v=' + sample[0]
        video_start_time = sample[1]
        # Download a 10-second clip starting at video_start_time from YouTube,
        # then extract its audio track.
        os.system("ffmpeg -ss " + str(video_start_time) +
                  " -i $(youtube-dl -i -f 37/22/18 -g \'" + url +
                  "\') -t " + str(10) + " -c copy video.mp4 >/dev/null 2>&1")
        os.system("ffmpeg -i video.mp4 audio.wav >/dev/null 2>&1")
        if not os.path.exists("./video.mp4"):
            print("Error in downloading youtube video")
            num_skipped_videos += 1
            continue
        # Obtain a cv2.VideoCapture object from the downloaded video.
        cap = cv2.VideoCapture("video.mp4")
        # Load the audio from file.
        ts, sr = li.core.load("./audio.wav", sr=48000)
        # Skip if the audio is shorter than 10 seconds.
        if len(ts) < 10 * sr:
            os.remove("./audio.wav")
            os.remove("./video.mp4")
            print("\n\n\nSample {} is too short to be processed.".format(sample[0]))
            print("Namely, the sample is {} seconds long.\n\n\n".format(len(ts) / sr))
            num_skipped_videos += 1
            continue
        ts = ts[0:10 * sr]  # cut the audio to exactly 10 seconds if it is longer
        # Get all the transformed frames.
        all_image_tensors, skip = get_frames(cap)
        # Skip the current video if an error occurred during frame extraction.
        if skip:
            num_skipped_videos += 1
            print("\n\n\nUnable to extract all frames from sample {}\n\n\n".format(sample[0]))
            if os.path.exists('./audio.wav'):
                os.remove('./audio.wav')
            if os.path.exists('./video.mp4'):
                os.remove('./video.mp4')
            for k in range(skip):
                if os.path.exists('frame{}.jpg'.format(k)):
                    os.remove('frame{}.jpg'.format(k))
            continue
        # Get predicted labels for the captured frames.
        max_pool_labels = get_frame_labels(all_image_tensors)
        # Build the set of basis vectors and object labels for each audio
        # sample. Initializing on W_all is None (rather than count == 0) keeps
        # this correct even when the first sample is skipped.
        if W_all is None:
            W_all = np.expand_dims(extract_bases(ts), 0)  # extract audio bases
            labels_all = max_pool_labels.detach().unsqueeze(0)  # predicted maxpool labels
        else:
            W = extract_bases(ts)  # extract audio bases
            W_all = np.concatenate((W_all, np.expand_dims(W, 0)))  # append audio bases
            labels_all = torch.cat((labels_all, max_pool_labels.detach().unsqueeze(0)), 0)
        # Remove all the captured images and the downloaded video and audio.
        for i in range(10):
            os.remove('./frame{}.jpg'.format(i))
        os.remove('./video.mp4')
        os.remove('./audio.wav')
        # Checkpoint to the h5 file every 500 samples in case the connection is lost.
        if count % 500 == 0:
            with h5py.File('./test_data.h5', 'w') as hdf5:
                hdf5.create_dataset('bases', data=W_all)
                hdf5.create_dataset('labels', data=labels_all)
    # Dump the audio frequency bases and ResNet maxpool labels into the h5 file.
    if W_all is not None:
        with h5py.File('./test_data.h5', 'w') as hdf5:
            hdf5.create_dataset('bases', data=W_all)
            hdf5.create_dataset('labels', data=labels_all)
    print("{} samples were skipped.".format(num_skipped_videos))
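# extract_bases is not shown in this section. The shapes it must produce
# (2401 frequency bins, 25 basis vectors per clip) are consistent with a
# 4800-point STFT at 48 kHz decomposed by 25-component NMF, so the following
# is only a plausible sketch under those assumptions.
import librosa
import numpy as np
from sklearn.decomposition import NMF

def extract_bases(ts, n_components=25):
    # Magnitude spectrogram: n_fft=4800 gives 2401 bins, and hop_length=2400
    # gives 201 frames for a 10-second clip at 48 kHz.
    spec = librosa.stft(ts, n_fft=4800, hop_length=2400)
    V = np.abs(spec)
    # Factorize V ~= W H and keep the frequency bases W, shape (2401, 25).
    model = NMF(n_components=n_components, init='random', max_iter=500)
    W = model.fit_transform(V)
    return W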
#from tensorflow.keras import Sequential
#from tensorflow.keras.layers import Dense, Dropout
#from tensorflow.keras.layers import Conv2D, MaxPool2D
#from tensorflow.keras.optimizers import Adam
#print(tf.__version__)
import pandas as pd
import numpy as np
from preprocessing import get_frames
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

Fs = 10
frame_size = Fs * 2  # 20 samples per window
hop_size = Fs * 1    # 10 samples between window starts

X_train, y_train = get_frames(frame_size, hop_size)
# Stack the features and labels side by side so they can be shuffled together,
# then split them apart again.
X_ = np.concatenate((X_train, y_train), axis=1)
#np.random.shuffle(X_)
X = X_[:, :-1]
y = X_[:, -1]

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
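# The snippet ends right after importing StandardScaler. A likely continuation,
# following the usual sklearn pattern, is to fit the scaler on the training
# split and apply the same transform to the test split; this completion is an
# assumption, not original code.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)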
        return_elements=['new_all_output:0'])
new_output = tf.identity(output, name='new_output')
print(new_output)

config = tf.ConfigProto(allow_soft_placement=True)
config.gpu_options.per_process_gpu_memory_fraction = 0.6
config.gpu_options.allow_growth = True

results = []
with tf.Session(config=config) as sess:
    fid = 0
    for vcount, vid in enumerate(annotations):
        print('Processing sequence {}...'.format(vid['name']))
        frames, _ = get_frames(vid['full_path'], skip=skip)
        for frame in frames:
            # Run software inference on the quantized frame.
            image = normalize(frame)[np.newaxis, ...]
            quantized_image = quantize_input_image(image, 5)
            pred = sess.run(new_output, feed_dict={input_: quantized_image})
            print(pred.shape)
            boxes, scores = get_bboxes(np.squeeze(pred), anchors, nms_params)
            for j in range(0, len(boxes)):
                box = boxes[j].tolist()
                results.append(
                    dict(image_id=fid,
                         category_id=1,
                         bbox=box,
                         score=scores[j].tolist(),
                         height=box[3]))
            if not len(boxes):  # if there are no detections
                results.append(dict(image_id=fid,
                                    category_id=0,
                                    bbox=[0, 0, 0, 0],
                                    score=0))
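# quantize_input_image is not defined in this section. Judging from the call
# quantize_input_image(image, 5), one plausible reading is fixed-point
# quantization with 5 fractional bits; the sketch below is purely an
# assumption about what that helper might do.
def quantize_input_image(image, frac_bits):
    scale = 2 ** frac_bits
    # Round to the nearest representable fixed-point value, then return the
    # dequantized float array the quantized model would effectively consume.
    return np.round(image * scale) / scale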
def main():
    # Retrieve the audio basis vectors for each object.
    object_dict = disentangle()
    if os.path.exists('./audio.wav'):
        os.remove('./audio.wav')
    if os.path.exists('./video.mp4'):
        os.remove('./video.mp4')
    # Test video: download a 10-second clip starting at 105 seconds, then
    # extract its audio track.
    video_url = 'https://www.youtube.com/watch?v=DOn33Ugbefw'
    os.system("ffmpeg -ss " + str(105) + " -i $(youtube-dl -i -f 37/22/18 -g \'" +
              video_url + "\') -t " + str(10) + " -c copy video.mp4 >/dev/null 2>&1")
    os.system("ffmpeg -i video.mp4 audio.wav >/dev/null 2>&1")
    if not os.path.exists("./video.mp4"):
        print("Error in downloading youtube video")
        exit(1)
    # Obtain a cv2.VideoCapture object from the downloaded video.
    cap = cv2.VideoCapture("video.mp4")
    # Load the audio file.
    ts, sr = librosa.core.load("./audio.wav", sr=48000)
    # Exit if the audio is shorter than 10 seconds.
    if len(ts) < 10 * sr:
        os.remove("./audio.wav")
        os.remove("./video.mp4")
        print("\n\n\nSample {} is too short to be processed.".format(1))
        print("Namely, the sample is {} seconds long.\n\n\n".format(len(ts) / sr))
        exit(1)
    # Crop to 10 seconds if the audio is longer.
    ts = ts[0:sr * 10]
    # Get all the transformed frames.
    all_image_tensors, skip = get_frames(cap)
    # Exit if an error occurred during the frame extraction process.
    if skip:
        print("\n\n\nUnable to extract all frames from sample {}\n\n\n".format(1))
        if os.path.exists('./audio.wav'):
            os.remove('./audio.wav')
        if os.path.exists('./video.mp4'):
            os.remove('./video.mp4')
        for k in range(skip):
            if os.path.exists('frame{}.jpg'.format(k)):
                os.remove('frame{}.jpg'.format(k))
        exit(1)
    # Get predicted labels for the captured frames.
    max_pool_labels = get_frame_labels(all_image_tensors)
    # Reshape the labels into (1000,) and apply softmax.
    labels = max_pool_labels.detach().unsqueeze(0).numpy().astype(float).reshape(1000,)
    softmax_labels = np.exp(labels) / np.sum(np.exp(labels), axis=0)
    # Take the top 4 predicted classes in the scene and intersect them with
    # the instrument classes handled below (541 drum, 402 guitar,
    # 579/881 piano, 889 violin).
    labels = set(softmax_labels.argsort()[-4:][::-1]).intersection(
        {889, 579, 881, 402, 541})
    # Re-index the drum/guitar/piano/violin labels for convenience.
    labels_new = []
    start_index = 0
    # sep holds the start index of each object's block of columns in the
    # concatenated W matrix.
    sep = [start_index]
    # Append the audio basis vectors of each object column-wise.
    for i in labels:
        if i == 541:
            labels_new.append('drum')
            start_index += object_dict['drum'].shape[1]
            sep.append(start_index)
        elif i == 402:
            labels_new.append('guitar')
            start_index += object_dict['guitar'].shape[1]
            sep.append(start_index)
        elif (i == 579 or i == 881) and 'piano' not in labels_new:
            labels_new.append('piano')
            start_index += object_dict['piano'].shape[1]
            sep.append(start_index)
        elif i == 889:
            labels_new.append('violin')
            start_index += object_dict['violin'].shape[1]
            sep.append(start_index)
    print("Objects in test video: ", labels_new)
    # The last index is the total number of basis vectors in the concatenated W.
    num_basis_vectors = sep[-1]
    # W has shape (num_of_frequency_bins, num_of_basis_vectors).
    W = np.zeros((2401, num_basis_vectors))
    # Concatenate the audio bases of each object into W column-wise.
    for index, object in enumerate(labels_new):
        W[:, sep[index]:sep[index + 1]] = object_dict[object]
    # Get the spectrograms of the audio.
    spec, magnitude_spec, phase_spec = get_spectrograms(ts)
    V = magnitude_spec
    # W_transpose is used as the fixed "H" in the NMF procedure.
    W_transpose = W.T
    assert W_transpose.shape == (num_basis_vectors, 2401)
    # Since sklearn can only solve V = WH while keeping H fixed, we solve the
    # factorization V^T = H^T W^T and take the transpose of the resulting
    # matrix H_t to retrieve H.
    H_t, _, _ = non_negative_factorization(X=V.T,
                                           H=W_transpose,
                                           n_components=num_basis_vectors,
                                           init='random',
                                           update_H=False,
                                           max_iter=1500,
                                           verbose=1)
    H = H_t.T
    V_dict = {}
    # Build the dictionary of per-object magnitude spectrograms.
    for i, object in enumerate(labels_new):
        V_dict[object] = np.matmul(object_dict[object], H[sep[i]:sep[i + 1]])
        assert V_dict[object].shape == (2401, 201)
    # Calculate the sum of all object magnitude spectrograms.
    V_sum = np.zeros((2401, 201))
    for V_obj in V_dict.values():
        V_sum = V_sum + V_obj
    # Mask the spectrogram, compute the ISTFT and write each source to a wav file.
    sample_rate = 48000
    for i, object in enumerate(labels_new):
        # Soft-mask the mixture spectrogram.
        double_V_j = (V_dict[object] / V_sum) * spec
        # Use the ISTFT to reconstruct the time-domain signal from the spectrogram.
        source_j = librosa.core.istft(double_V_j, hop_length=2400)
        # Write the reconstructed signal to a wav file for testing.
        print("Writing to ./{}.wav...".format(object))
        librosa.output.write_wav('./{}.wav'.format(object), source_j, sample_rate)
    # Remove the generated audio, video and frame images.
    if os.path.exists('./audio.wav'):
        os.remove('./audio.wav')
    if os.path.exists('./video.mp4'):
        os.remove('./video.mp4')
    for i in range(10):
        if os.path.exists('./frame{}.jpg'.format(i)):
            os.remove('./frame{}.jpg'.format(i))
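# get_spectrograms is not shown in this section. Given the (2401, 201) shapes
# and the hop_length=2400 used for the ISTFT above, it plausibly wraps a
# 4800-point STFT; the sketch below is an assumption, not original code.
def get_spectrograms(ts):
    # Complex STFT of the 10-second, 48 kHz clip: shape (2401, 201).
    spec = librosa.stft(ts, n_fft=4800, hop_length=2400)
    magnitude_spec, phase_spec = librosa.magphase(spec)
    return spec, magnitude_spec, phase_spec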
from sqlalchemy import Column, Integer, LargeBinary, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship, sessionmaker
import pickle
import preprocessing as pp
import local_features as lf

Base = declarative_base()

video_source_file_path = "../dataset/videos/200_512kb.mp4"
test_frames_file_path = "frames/"
fps = 30
width = 480
height = 360
frames = pp.get_frames(video_file_source_path=video_source_file_path, req_fps=fps, width=width, height=height)
key_points = lf.extract_sift_key_points(image=frames[20])
descriptors = lf.extract_sift_descriptors(image=frames[20])
# Pickle the descriptors so they can be stored in a BLOB column.
p = pickle.dumps(descriptors)

class Video(Base):
    __tablename__ = 'local_features_video'
    # Here we define the columns of the local_features_video table.
    # Notice that each column is also a normal Python instance attribute.
    id = Column(Integer, primary_key=True)
    video_name = Column(LargeBinary(length=(2**32) - 1), nullable=False)
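# A minimal sketch of how the pickled descriptors might be persisted with the
# imports above. The table, column and sqlite URL below are assumptions added
# for illustration, since the class as shown only stores video_name.
class VideoDescriptors(Base):
    __tablename__ = 'local_features_descriptors'
    id = Column(Integer, primary_key=True)
    descriptors = Column(LargeBinary(length=(2**32) - 1), nullable=False)

engine = create_engine('sqlite:///local_features.db')  # assumed database URL
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
session = Session()
session.add(VideoDescriptors(descriptors=p))
session.commit()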