def download_synthetic(version, images_dir):
    temp = temp_dir()
    file_handle, _ = urlretrieve(version["url"])
    with zipfile.ZipFile(file_handle, 'r') as zipObj:
        zipObj.extractall(temp)

    labels_dir = path.join(temp, "hand_labels_synth")
    for split in ['synth1', 'synth2', 'synth3', 'synth4']:
        original_split_dir = path.join(labels_dir, split)
        split_dir = path.join(images_dir, split)
        makedir(split_dir)

        files = sorted([f for f in os.listdir(original_split_dir) if f.endswith('.json')])
        for file in files:
            content = json.load(open(path.join(original_split_dir, file), "r"))
            fname = file.replace(".json", ".jpg")
            copyfile(path.join(original_split_dir, fname), path.join(split_dir, fname))

            yield split, {
                "image": "/".join(["images", split, fname]),
                "hand": "left" if content["is_left"] else "right",
                "pose": content["hand_pts"]
            }
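# The `makedir` and `temp_dir` helpers used throughout these downloaders come from
# utils.file_system and are not shown in this section. A minimal sketch of the
# assumed behaviour (create a directory if missing; hand back a fresh temporary
# directory with a trailing separator) might look like this:
import os
import tempfile


def makedir(directory: str) -> str:
    # Create the directory (and parents) if it does not already exist
    os.makedirs(directory, exist_ok=True)
    return directory


def temp_dir() -> str:
    # A new, unique directory under the system temp location
    return tempfile.mkdtemp() + os.sep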
def pose_video(datum):
    if not exists(datum['video']):
        try:
            URLopener().retrieve(datum["video_url"], datum["video"])
        except Exception:
            makedir(datum["pose_dir"])  # Empty directory, so this datum is not retried

    if not exists(datum["pose_dir"]):
        gpu = get_empty_gpu()

        # Create Container
        container_id = Docker.create_container(DOCKER_NAME, "-it -v " + datum["video"] + ":/video.mp4")

        def remove_container():
            Docker.remove_container(container_id)

        try:
            # Start Container
            Docker.start_container(container_id)

            cmd = "./build/examples/openpose/openpose.bin --video /video.mp4 --model_pose BODY_25 --display 0 --render_pose 0 --write_json /out/ --hand --face --num_gpu 1 "
            cmd += " --num_gpu_start " + str(gpu)
            Docker.exec_container(container_id, "bash -c 'cd /openpose && " + cmd + "'")

            # Copy files
            Docker.cp_container_directory(container_id, datum["pose_dir"], "/out/")
        finally:
            # Always clean up the container, whether OpenPose succeeded or failed
            remove_container()

    return True
def cp_container_directory(container_id: str, local_dir: str, docker_dir: str):
    makedir(local_dir)
    d_cp = "nvidia-docker cp " + container_id + ":" + docker_dir + ". " + local_dir
    print(d_cp)
    status = os.system(d_cp)
    if int(status) != 0:
        raise Exception("CP Status " + str(status))
def download(version, directory: str, dataset: list):
    if version["version"] != "Mediapipe":
        raise ValueError("Running this addon version is not implemented")

    poses_dir = path.join(directory, "poses")
    makedir(poses_dir)

    Docker.verify_image_exists(DOCKER_NAME)

    should_cleanup = False
    while True:
        existing = {path.join(poses_dir, di) for di in os.listdir(poses_dir)}

        missing_data = []
        for datum in dataset:
            datum["pose_dir"] = path.join(poses_dir, datum["id"])
            if datum["pose_dir"] not in existing:
                missing_data.append(datum)

        # Break when finished
        if len(missing_data) == 0:
            break

        print(missing_data)

        should_cleanup = True
        print("Done", len(dataset) - len(missing_data), "/", len(dataset), "tasks")

        # should_cleanup = False
        # for datum in tqdm(missing_data):
        #     pose_video(datum)

        distributed.clear_tasks()
        distributed.kill_slaves()
        clean_dockers()
        distributed.spawn_workers().flower()
        distributed.run(pose_video, missing_data[:50000])

    if should_cleanup:
        distributed.kill_slaves()
        clean_dockers()

    with jsonlines.open(path.join(directory, "index.jsonl"), mode='w') as writer:
        for datum in tqdm(dataset):
            writer.write({
                "id": datum["id"],
                "poses": get_directory_hands(datum["pose_dir"])
            })
def download(directory: str, version, module_path: str, dataset=None):
    makedir(directory)
    version_dir = path.join(directory, version["version"])
    index_path = path.join(version_dir, 'index.jsonl')

    if not exists(version_dir) or not exists(index_path):
        makedir(version_dir)
        module = modular_import("module", module_path)
        if dataset is None:
            module.download(version, version_dir)
        else:
            module.download(version, version_dir, dataset)

    data = list(jsonlines.open(index_path))
    return version_dir, data
def download_digits(images_dir):
    from keras.datasets import mnist

    mnist_data = mnist.load_data()
    for split, (x, y) in zip(["train", "test"], mnist_data):
        split_dir = path.join(images_dir, split)
        makedir(split_dir)

        for i, (image, label) in enumerate(zip(x, y)):
            f_name = str(label) + "_" + str(i) + ".png"
            Image.fromarray(image).save(path.join(split_dir, f_name))

            yield split, {
                "label": str(label),
                "image": "/".join([split, f_name])
            }
def download(version, directory):
    images_dir = path.join(directory, "images")
    makedir(images_dir)

    if version["version"] == "manual":
        res = download_manual(version, images_dir)
    elif version["version"] == "synthetic":
        res = download_synthetic(version, images_dir)
    else:
        raise ValueError("Downloading this version is not implemented")

    splits = defaultdict(list)
    with jsonlines.open(path.join(directory, "index.jsonl"), mode='w') as writer:
        for i, (split, row) in tqdm(enumerate(res)):
            splits[split].append(i)
            writer.write(row)

    json.dump(splits, open(path.join(directory, 'split.json'), "w"))
def download_manual(version, images_dir):
    temp = temp_dir()
    file_handle, _ = urlretrieve(version["url"])
    with zipfile.ZipFile(file_handle, 'r') as zipObj:
        zipObj.extractall(temp)

    labels_dir = path.join(temp, "hand_labels")
    for split in ["train", "test"]:
        original_split_dir = path.join(labels_dir, "manual_" + split)
        split_dir = path.join(images_dir, split)
        makedir(split_dir)

        files = sorted([f for f in os.listdir(original_split_dir) if f.endswith('.json')])
        for file in files:
            content = json.load(open(path.join(original_split_dir, file), "r"))
            fname = file.replace(".json", ".jpg")

            # Crop image
            all_x, all_y, _ = zip(*content["hand_pts"])
            size = round(max(max(all_x) - min(all_x), max(all_y) - min(all_y)) / 2)
            x = min(all_x) - size
            y = min(all_y) - size

            im = Image.open(path.join(original_split_dir, fname))
            crop = im.crop((x, y, x + 4 * size, y + 4 * size))
            crop.save(path.join(split_dir, fname))

            yield split, {
                "image": "/".join(["images", split, fname]),
                "hand": "left" if content["is_left"] else "right",
                "pose": [(x1 - x, y1 - y, z) for x1, y1, z in content["hand_pts"]]
            }
def download_SpreadTheSign(directory):
    makedir(path.join(directory, "videos"))

    # Initialize MultiProcessing Pool
    processes = multiprocessing.cpu_count() - 1
    pool = Pool(processes)

    # First gets the list of words and their languages
    print("Indexing SpreadTheSign...")
    words = index_words(directory, pool, processes)

    # For every ID and language, get the metadata
    print("Getting metadata for each sign...")
    data_index_path = path.join(directory, "data_index.json")
    data = json.load(open(data_index_path)) if exists(data_index_path) else []
    existing = {"_".join(d["id"].split("_")[:-1]) for d in data}

    videos = [(i, l) for i, languages in words.items() for l in languages
              if str(i) + "_" + str(l) not in existing]
    for chunk in tqdm(chunks(videos, processes * 10)):
        data += list(itertools.chain.from_iterable(pool.imap(get_video, list(chunk))))
        json.dump(data, open(data_index_path, "w"), indent=2)

    return data
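# `chunks` is used above but defined elsewhere in the repo. A minimal sketch of the
# assumed helper, yielding successive fixed-size slices of a list:
def chunks(lst, n):
    # Yield consecutive slices of at most n items
    for i in range(0, len(lst), n):
        yield lst[i:i + n]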
def download_sign_language(version, images_dir):
    labels = string.ascii_lowercase

    temp = temp_dir()
    file_handle, _ = urlretrieve(version["url"])
    with zipfile.ZipFile(file_handle, 'r') as zipObj:
        zipObj.extractall(temp)

    for split in ["train", "test"]:
        split_dir = path.join(images_dir, split)
        makedir(split_dir)

        csv = [[int(r) for r in row.split(",")]
               for row in open(path.join(temp, "sign_mnist_" + split + ".csv")).readlines()[1:]]
        for i, row in enumerate(csv):
            label = labels[row.pop(0)]
            image = np.array(row, dtype=np.uint8).reshape((28, 28))
            f_name = label + "_" + str(i) + ".png"
            Image.fromarray(image).save(path.join(split_dir, f_name))

            yield split, {"label": label, "image": "/".join([split, f_name])}
import os
from os import path
from os.path import exists

from jsonlines import jsonlines
from tqdm import tqdm

from addons.OpenPose.pose_util import get_directory_person
from utils.dataset import load
from utils.file_system import makedir, listdir

if __name__ == "__main__":
    version = {"version": "BODY_25"}
    directory = "/home/nlp/amit/PhD/meta-scholar/datasets/SLCrawl/versions/SpreadTheSign/OpenPose/BODY_25"
    # dataset = load("SLCrawl", version="SpreadTheSign")

    poses_dir = path.join(directory, "poses")
    makedir(poses_dir)

    existing = {path.join(poses_dir, d) for d in os.listdir(poses_dir)}
    print("Finished", len(existing))
def execute(self, run_name=None, tabs=0, x_params=None, previous_name=None, cache_name: str = None):
    # Every execution should refresh initial params
    params = CachedDict().union(self.initial_params) \
        if isinstance(self.initial_params, CachedDict) \
        else CachedDict(self.initial_params if self.initial_params else {})

    self.local_timer = Time.now()
    self.global_timer = Time.now()

    if not x_params:
        x_params = CachedDict()
    x_params = x_params.union(params)  # Add F to X

    if not previous_name:
        previous_name = cache_dir
        makedir(previous_name)

    if cache_name:
        previous_name = path.join(previous_name, cache_name)
        makedir(previous_name)

    if run_name:
        print(" " * tabs, run_name)

    key_len = max([len(qi.key) for qi in self.queue] + [0]) + 5
    name_len = max([len(qi.name) for qi in self.queue] + [0]) + 5

    for qi in self.queue:  # key, name, method, load_cache, load_self
        if qi.key != "out" and not isinstance(qi.method, Pipeline):
            print((" " * (tabs + 1)) + ("%-" + str(key_len) + "s %-" + str(name_len) + "s") % (qi.key, qi.name),
                  end=" ")

        pn = path.join(previous_name, qi.key)
        pnf = pn + "." + qi.ext

        if qi.load_cache and path.isfile(pnf):
            params.add_cache(qi.key, pnf)
            if qi.load_self:
                params.load_cache(qi.key)
        else:
            if isinstance(qi.method, Pipeline):
                params[qi.key] = qi.method.execute(run_name=qi.name, tabs=tabs + 1,
                                                   x_params=x_params.union(params), previous_name=pn)
                if isinstance(params[qi.key], CachedDict) and "out" in params[qi.key]:
                    params.copy_key(qi.key, params[qi.key], "out")
            else:
                if self.mute:
                    Silencer.mute()
                params[qi.key] = qi.method(params, x_params)
                if self.mute:
                    Silencer.unmute()

                f = open(pnf, "wb" if qi.ext not in ["txt", "json"] else "w")
                if qi.ext == "pkl":
                    pickle.dump(params[qi.key], f)
                else:
                    if qi.ext in ["png", "jpg", "wav", "mp4"]:
                        params[qi.key] = get_file_bytes(params[qi.key], format=qi.ext)
                    f.write(params[qi.key])
                f.close()

        if qi.key != "out" and not isinstance(qi.method, Pipeline):
            local_passed, global_passed = self.timer_report()
            report = params[qi.key].report() \
                if qi.key in params.val_dict and hasattr(params[qi.key], "report") else ""
            print(("%-15s\t\t" + report) % (local_passed))

    return params
def download(version, directory):
    raw_dir = path.join(directory, "raw")
    makedir(raw_dir)

    print("The PIG dataset requires authentication, which is granted to everyone who requests it.")
    print("You can register at: http://beam.kisarazu.ac.jp/~saito/research/PianoFingeringDataset/register.php")
    print("")
    username = input("Insert your username:")
    password = input("Insert your password:")

    url = version["url"]

    # Make further requests authenticated
    password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    password_mgr.add_password(None, url, username, password)
    handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
    urllib.request.install_opener(urllib.request.build_opener(handler))

    file_handle, _ = urlretrieve(url)
    with zipfile.ZipFile(file_handle, 'r') as zipObj:
        zipObj.extractall(raw_dir)
    print("Download done")

    raw_dataset_dir = path.join(raw_dir, os.listdir(raw_dir)[0])

    metadata = csv.reader(open(path.join(raw_dataset_dir, "List.csv"), 'r'))
    next(metadata)  # Skip the header row
    metadata_id = {int(r[0]): r for r in metadata}

    fingering_dir = path.join(raw_dataset_dir, "FingeringFiles")
    with jsonlines.open(path.join(directory, "index.jsonl"), mode='w') as writer:
        for file in listdir(fingering_dir, full=False):
            file_id, tagger_id = map(int, file.split("_")[0].split("-"))
            meta = metadata_id[file_id]

            datum = {
                "id": file.split("_")[0],
                "piece": meta[2],
                "composer": meta[1],
                "tagger": meta[5 + tagger_id],
                "#bars": int(meta[3]),
                "#notes": int(meta[4]),
                "notes": []
            }

            f = open(path.join(fingering_dir, file), "r")
            notes = [r.split() for r in f.read().splitlines()[1:]]
            f.close()

            for note in notes:
                datum["notes"].append({
                    "on_event": {"time": float(note[1]), "velocity": int(note[4])},
                    "off_event": {"time": float(note[2]), "velocity": int(note[5])},
                    "spelled_pitch": note[3],
                    "midi_pitch": note_to_midi(note[3]),
                    "channel": int(note[6]),
                    "fingers": [{"finger": abs(f), "hand": "right" if f > 0 else "left"}
                                for f in [int(f) for f in note[7].strip('_').split("_")]]
                })

            writer.write(datum)
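# `note_to_midi` is referenced above but not defined in this section. A minimal
# sketch, assuming scientific pitch notation such as "C#4" or "Eb3" and the common
# convention that C4 maps to MIDI note 60:
def note_to_midi(spelled_pitch: str) -> int:
    base = {"C": 0, "D": 2, "E": 4, "F": 5, "G": 7, "A": 9, "B": 11}
    letter = spelled_pitch[0].upper()
    rest = spelled_pitch[1:]

    # Accumulate sharps (#) and flats (b) before the octave number
    accidental = 0
    while rest and rest[0] in "#b":
        accidental += 1 if rest[0] == "#" else -1
        rest = rest[1:]

    octave = int(rest)
    return (octave + 1) * 12 + base[letter] + accidental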
import json
from collections import Counter
from functools import lru_cache
from itertools import chain
from os.path import isfile

import requests

from utils.file_system import makedir

cache = "/tmp/dbpedia/"
makedir(cache)

DBPEDIA = "http://dbpedia.org/"


@lru_cache(maxsize=None)
def normalize_entity(entity: str):
    return entity \
        .replace("/", "%2F") \
        .replace("&", "%26") \
        .replace("+", "%2B")


@lru_cache(maxsize=None)
def get_dbpedia_entity(entity: str):
    entity = normalize_entity(entity)
    cache_ent = cache + entity + ".json"
    if isfile(cache_ent):
        f = open(cache_ent, "r")
        content = json.load(f)
def download(version, directory: str):
    FFmpeg.check_installed()

    dataset_directory = path.join(directory, "dataset")
    if not path.exists(dataset_directory):
        dataset_file = path.join(directory, "dataset.tgz")
        if not path.exists(dataset_file):
            print("Downloading Archive")
            wget.download(version["url"], dataset_file)

        print("Extracting Archive")
        tar = tarfile.open(dataset_file)
        tar.extractall(path=dataset_directory)
        tar.close()
        remove(dataset_file)

    archive_directory = path.join(dataset_directory, version["version"]) \
        if version["version"] == "ChicagoFSWild" else dataset_directory

    frames_directory = path.join(archive_directory, "frames")
    if not path.exists(frames_directory):
        print("Extracting Frames Archive")
        tar = tarfile.open(path.join(archive_directory, version["version"] + "-Frames.tgz"))
        tar.extractall(path=frames_directory)
        tar.close()

    if version["version"] == "ChicagoFSWildPlus":
        frames_directory = path.join(frames_directory, version["version"])

    videos_directory = path.join(directory, "videos")
    makedir(videos_directory)

    splits = defaultdict(list)
    data = []

    if version["version"] == "ChicagoFSWild":
        print("Note! While ChicagoFSWild contains hand bounding boxes, we do not load them at this time.")

    with open(path.join(archive_directory, version["version"] + ".csv")) as csv_file:
        csv_data = csv.reader(csv_file, delimiter=',')
        next(csv_data)  # Ignore the header

        for i, row in tqdm(enumerate(csv_data)):
            # As this ID will be used for file naming, lets make it work by default
            datum_id = row[1].replace("/", "-").replace("_(youtube)", "").replace("_(nad)", "")

            # Convert Frames to a video
            datum_frames = path.join(frames_directory, row[1])
            datum_video = path.join(videos_directory, datum_id + ".mp4")
            if not path.exists(datum_video):
                FFmpeg.video_from_frames(datum_frames, 4, datum_video)

            data.append({
                "id": datum_id,
                "texts": [{"text": row[7]}],
                "gloss": word2chars(row[7]),
                "description": row[9],
                "video_url": row[2],
                "video": datum_video,
                "timing": {"start": row[3]},
                "sign_language": "en.us",
                "text_language": "en",
                "signer": {"name": row[-1]},
                "metadata": {
                    "frames": row[4],
                    "width": int(row[5]),
                    "height": int(row[6])
                }
            })
            splits[row[10]].append(i)

    with jsonlines.open(path.join(directory, "index.jsonl"), mode='w') as writer:
        for datum in data:
            writer.write(datum)

    json.dump(dict(splits), open(path.join(directory, "split.json"), "w"))
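# `word2chars` is used above for the "gloss" field but is not defined in this
# section, and the exact gloss convention is an assumption here. A plausible
# sketch, expanding a fingerspelled word into its individual letters:
def word2chars(text: str) -> str:
    # e.g. "cat" -> "c a t" (whitespace in the source text is dropped)
    return " ".join(list(text.replace(" ", "")))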
def execute(self, run_name=None, tabs=0, x_params=None, previous_name=None, cache_name: str = None):
    self.local_timer = Time.now()
    self.global_timer = Time.now()

    if not x_params:
        x_params = CachedDict()

    if not previous_name:
        previous_name = cache_dir
        makedir(previous_name)

    if cache_name:
        previous_name = path.join(previous_name, cache_name)
        makedir(previous_name)

    if run_name:
        print(" " * tabs, run_name)

    key_len = max([len(qi.key) for qi in self.queue] + [0]) + 5
    name_len = max([len(qi.name) for qi in self.queue] + [0]) + 5

    for qi in self.queue:  # key, name, method, load_cache, load_self
        if qi.key != "out" and not isinstance(qi.method, Pipeline):
            print((" " * (tabs + 1)) + ("%-" + str(key_len) + "s %-" + str(name_len) + "s") % (qi.key, qi.name),
                  end=" ")

        pn = path.join(previous_name, qi.key)
        pnf = pn + "." + qi.ext

        if qi.load_cache and qi.key != "out" and path.isfile(pnf):
            self.params.add_cache(qi.key, pnf)
            if qi.load_self:
                self.params.load_cache(qi.key)
        else:
            if isinstance(qi.method, Pipeline):
                self.params[qi.key] = qi.method.execute(run_name=qi.name, tabs=tabs + 1,
                                                        x_params=x_params.union(self.params), previous_name=pn)
                if "out" in self.params[qi.key]:
                    self.params.copy_key(qi.key, self.params[qi.key], "out")
            else:
                if self.mute:
                    Silencer.mute()
                self.params[qi.key] = qi.method(self.params, x_params)
                if self.mute:
                    Silencer.unmute()

                f = open(pnf, "wb" if qi.ext != "txt" else "w")
                if qi.ext == "sav":
                    pickle.dump(self.params[qi.key], f)
                else:
                    f.write(self.params[qi.key])
                f.close()

        if qi.key != "out" and not isinstance(qi.method, Pipeline):
            local_passed, global_passed = self.timer_report()
            report = self.params[qi.key].report() \
                if qi.key in self.params.val_dict and hasattr(self.params[qi.key], "report") else ""
            print(("%-15s\t\t" + report) % (local_passed))

    return self.params
def download_FingerSpell(version, directory):
    FFmpeg.check_installed()

    letters = ['rest'] + list(string.ascii_lowercase)

    animated = {"j": temp_name(".jpg"), "z": temp_name(".jpg")}
    for l, f in list(animated.items()):
        urlretrieve(version["url"] + l + "-begin_" + l + "-end.jpg", f)
        animated[l] = cv2.imread(f)

    videos_path = path.join(directory, "videos")
    makedir(videos_path)

    for l1 in tqdm(letters):
        for l2 in tqdm(letters):
            is_l2_animated = l2 in animated
            is_l1_animated = l1 in animated

            text = (l1 + l2).replace("rest", "")
            gloss = "" if l1 == l2 == "rest" \
                else l2 + "#" if l1 == "rest" \
                else "#" + l1 if l2 == "rest" \
                else "#" + l1 + "# #" + l2 + "#"

            download_l1 = l1
            download_l2 = l2
            if is_l2_animated:
                download_l2 = download_l2 + "-begin"
            if is_l1_animated:
                download_l1 = download_l1 + "-end"

            full_url = version["url"] + download_l1 + "_" + download_l2 + ".jpg"
            video_path = path.join(videos_path, text + ".mp4")
            if not path.exists(video_path):
                temp = temp_name(".jpg")
                urlretrieve(full_url, temp)

                img = cv2.imread(temp)
                if is_l2_animated and not is_l1_animated:
                    img = np.concatenate((img, animated[l2]))
                if is_l1_animated and not is_l2_animated:
                    img = np.concatenate((animated[l1], img))

                imgs = img.reshape((int(img.shape[0] / 256), 256, 256, 3))

                temp_dir_name = temp_dir()
                for i, im in enumerate(imgs):
                    cv2.imwrite(temp_dir_name + str(i).zfill(2) + ".jpg", im)

                FFmpeg.video_from_frames(temp_dir_name, 2, video_path)

            yield {
                "id": text if text != "" else "rest",
                "texts": [{"text": text}],
                "gloss": gloss,
                "video": video_path,
                "video_url": full_url,
                "sign_language": "en.us",
                "text_language": "English",
                "metadata": {"width": 256, "height": 256}
            }