def main():
    parser = argparse.ArgumentParser()  # argparse will show a help message instead of a raw error
    parser.add_argument("-dicom_dir", help='directory of the DICOM files', type=str)
    parser.add_argument("-output_dir",
                        help='output directory where all the .png files will be stored',
                        type=str)
    parser.add_argument("-num_workers", help='number of CPU workers', type=int, default=16)
    args = parser.parse_args()
    try:
        os.mkdir(args.output_dir)
    except FileExistsError:
        pass
    convert_to_png_ = partial(convert_to_png, dir_img=args.output_dir)
    parallel(convert_to_png_,
             list(Path(args.dicom_dir).iterdir()),
             max_workers=args.num_workers)
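# Usage sketch (illustrative: assumes the snippet above lives in a script such as dicom_to_png.py,
# and that convert_to_png and parallel are imported/defined elsewhere in that file):
#
#   python dicom_to_png.py -dicom_dir ./dicoms -output_dir ./pngs -num_workers 8
#
# Each file in -dicom_dir is handed to convert_to_png, which writes its .png into -output_dir.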
def create_dataset(path_fullRes: Path, path_list, downsize=True):
    il = ImageList.from_folder(path_fullRes)
    for p, size, qf in path_list:
        if not p.exists():
            print(f"Creating {p}")
            print(f"Size: {size} with {qf} quality factor")
            parallel(partial(create_training_images, p_hr=path_fullRes, p_lr=p,
                             size=size, qualityFactor=qf, downsize=downsize),
                     il.items)
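# Usage sketch for create_dataset (paths, sizes, and quality factors below are illustrative):
# each entry in path_list pairs a low-resolution output folder with a target size and a JPEG
# quality factor passed through to create_training_images.
#
#   path_fullRes = Path('data/full_res')
#   path_list = [
#       (Path('data/lr_256_q40'), 256, 40),
#       (Path('data/lr_512_q60'), 512, 60),
#   ]
#   create_dataset(path_fullRes, path_list, downsize=True)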
def preprocess_parallel(arr, size, directory, target):
    args = [(name, size, directory, target) for name in arr]
    status = parallel(preProcess, args)
    # parallel returns one status flag per file, so compare the number of successes to len(arr)
    if sum(status) == len(arr):
        print("STATUS_CODE == OK")
    else:
        print("STATUS_CODE == NOT ALL FILES COULD BE LOADED {}/{}".format(
            sum(status), len(arr)))
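# Usage sketch (argument values are illustrative; preProcess is assumed to accept a
# (name, size, directory, target) tuple and to return 1/True on success):
#
#   preprocess_parallel(file_names, size=256, directory='raw/', target='processed/')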
def get_all_issue_text(owner, repo, inf_wrapper, workers=64):
    """
    Prepare embedding features of all issues in a given repository.

    Returns
    -------
    dict
        {'features': list, 'labels': list, 'nums': list}
    """
    # prepare list of issue nums
    max_num = find_max_issue_num(owner, repo)

    get = partial(get_issue_text, owner=owner, repo=repo, skip_issue=True)
    issues = parallel(get, list(range(1, max_num + 1)), max_workers=workers)

    # filter out issues with problems
    filtered_issues = []
    for issue in issues:
        if issue:
            filtered_issues.append(issue)

    logging.info(f'Retrieved {len(filtered_issues)} issues.')

    features = []
    labels = []
    nums = []

    for issue in tqdm(filtered_issues):
        labels.append(issue['labels'])
        nums.append(issue['num'])
        # calculate embedding
        text = inf_wrapper.process_dict(issue)['text']
        feature = inf_wrapper.get_pooled_features(text).detach().cpu()
        # only need the first 1600 dimensions
        features.append(feature[:, :1600])

    assert len(features) == len(labels), \
        'Error: mismatch between number of observations and labels.'

    return {
        'features': torch.cat(features).numpy(),
        'labels': labels,
        'nums': nums
    }
def get_all_issue_text(owner, repo, inf_wrapper, workers=64):
    """
    Prepare embedding features of all issues in a given repository.

    Returns
    -------
    dict
        {'features': list, 'labels': list, 'nums': list}
    """
    # prepare list of issue nums
    max_num = find_max_issue_num(owner, repo)

    get = partial(get_issue_text, owner=owner, repo=repo, skip_issue=True)
    issues = parallel(get, list(range(1, max_num + 1)), max_workers=workers)

    if not issues:
        raise ValueError(f"No issues retrieved for {owner}/{repo}")

    # filter out issues with problems
    filtered_issues = []
    for issue in issues:
        if issue:
            filtered_issues.append(issue)

    logging.info(
        f'Repo {owner}/{repo}: retrieved {len(filtered_issues)} issues.')

    labels = []
    nums = []
    issues_dict = {'title': [], 'body': []}

    for issue in tqdm(filtered_issues):
        labels.append(issue['labels'])
        nums.append(issue['num'])
        issues_dict['title'].append(issue['title'])
        issues_dict['body'].append(issue['body'])

    features = inf_wrapper.df_to_embedding(pd.DataFrame.from_dict(issues_dict))

    assert len(features) == len(labels), \
        'Error: mismatch between number of observations and labels.'

    return {'features': features[:, :1600], 'labels': labels, 'nums': nums}
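# Usage sketch (owner/repo names are illustrative; inf_wrapper is assumed to expose the
# df_to_embedding method used above):
#
#   data = get_all_issue_text('some-org', 'some-repo', inf_wrapper, workers=32)
#   features, labels, nums = data['features'], data['labels'], data['nums']
#   # features holds one 1600-dimensional embedding per issue, aligned row-for-row with labels and nums.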
def download_wiki_images(wiki_csv, dest, dest_csv=None, force_download=False,
                         max_rows=None, max_workers=8):
    """
    Download wiki images from the wiki csv.
    Save a csv with an added column for the local path of each downloaded image.
    """
    dest_csv = dest_csv or os.path.join(dest, 'downloaded.csv')

    def try_convert_to_list(x):
        try:
            r = ast.literal_eval(x)
            return r if isinstance(r, list) else None
        except (ValueError, SyntaxError):
            return None

    if not os.path.exists(dest):
        os.makedirs(dest)

    if os.path.exists(dest_csv) and not force_download:
        df = pd.read_csv(dest_csv)
        df['images'] = df['images'].apply(try_convert_to_list)
        df = df[df['images'].notnull()]
        return df

    df = pd.read_csv(wiki_csv)
    df['images'] = df['images'].apply(try_convert_to_list)
    df = df[df['images'].notnull()]
    if max_rows is not None:
        df = df.iloc[:max_rows]

    paths = parallel(partial(download_single_image, dest), df['images'],
                     max_workers=max_workers)
    df['image_path'] = pd.Series(dict(paths)).drop_duplicates()
    df = df[df['image_path'].notnull()]
    df.to_csv(dest_csv, index=False)
    return df
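# Usage sketch (paths are illustrative): each row's image list is handed to download_single_image,
# and the resulting local paths are recorded in a new 'image_path' column of the returned frame.
#
#   df = download_wiki_images('wiki.csv', dest='images/', max_rows=1000, max_workers=16)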
def __call__(self, items):
    if isinstance(items[0], Path):
        items = [read_file(i) for i in items]
    chunks = [items[i:i + self.chunksize]
              for i in range(0, len(items), self.chunksize)]
    toks = parallel(self.proc_chunk, chunks, max_workers=8)
    return sum(toks, [])
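# Usage sketch (the owning class is not shown, so the constructor below is illustrative): the
# processor reads Paths if needed, splits the items into chunks of self.chunksize, tokenizes the
# chunks in parallel with proc_chunk, and flattens the per-chunk token lists back into one list.
#
#   proc = ChunkedTokenizeProcessor(chunksize=5000)          # hypothetical class name
#   toks = proc(list(Path('texts').glob('*.txt')))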
def get_largest_img_size(img_list, path=None, max_workers=20):
    if path:
        img_list = [f'{path}/{fn}' for fn in img_list]
    imgs_shape = np.array(
        parallel(get_img_dimension, img_list, max_workers=max_workers)).T
    return [max(imgs_shape[0]), max(imgs_shape[1]), max(imgs_shape[2])]
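# Usage sketch (file names are illustrative): returns the per-axis maxima of the shapes reported
# by get_img_dimension, e.g. the largest height, width, and channel count across the listed images.
#
#   biggest = get_largest_img_size(['a.png', 'b.png', 'c.png'], path='images', max_workers=8)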
def window_and_normalize(im):
    rescaled = im.pixel_array * float(im.RescaleSlope) + float(im.RescaleIntercept)
    windowed = rescaled.clip(min=window_center - window_width,
                             max=window_center + window_width)
    # shift to [0, 2 * window_width] and scale to [0, 255]
    return (windowed - (window_center - window_width)) / (window_width * 2 / 255)

# quick visual check of the windowing (assumes a DICOM `im` was loaded above)
plt.imshow(window_and_normalize(im), cmap=plt.cm.bone)

def resize(src, dst, sz):
    im = pydicom.read_file(str(src))
    ary = window_and_normalize(im)
    # values are in [0, 255], so store as an unsigned 8-bit grayscale image
    im = PIL.Image.fromarray(ary.astype(np.uint8), mode='L')
    im.resize((sz, sz), resample=PIL.Image.BICUBIC).save(f'{dst}/{src.stem}.png')

import pandas as pd

df = pd.read_csv('/media/docear/My Passport/Kaggle/Hemorrhage/stage_1_train.csv')  # path to your CSV
df[df.ID.str.match('ID_6431af929')]
# drop the problematic study and write out the revised label file
# (revisedtrainCSVpath is assumed to be defined earlier in the notebook)
df = df[~df.ID.str.match('ID_6431af929')]
df.to_csv(revisedtrainCSVpath, index=False)

print('Processing Train Set')

def resize_112(path, _):
    # set the destination to a new folder that will hold the processed training set
    resize(path, '/media/docear/My Passport/Kaggle/Hemorrhage/data/112/train', 112)

# paths / pathsTest are Path objects pointing at the raw train / test DICOM folders (defined elsewhere)
parallel(resize_112, list(paths.iterdir()), max_workers=12)

print('Processing Test Set')

def resize_112_test(path, _):
    # set the destination to a new folder that will hold the processed test set
    resize(path, '/media/docear/My Passport/Kaggle/Hemorrhage/data/112/test', 112)

parallel(resize_112_test, list(pathsTest.iterdir()), max_workers=12)
numOfImages = len(files)
t = time.time()
for file in files:
    with Image.open(os.path.join(INPATH, file)) as im:
        for i in range(1, tilesPerImage + 1):
            newname = file.replace('.', '_{:03d}.'.format(i))
            w, h = im.size
            x = random.randint(0, w - dx - 1)
            y = random.randint(0, h - dy - 1)
            # print("Cropping {}: {},{} -> {},{}".format(file, x, y, x + dx, y + dy))
            crop = im.crop((x, y, x + dx, y + dy))
            # Image.ANTIALIAS is the same filter as Image.LANCZOS in newer Pillow releases
            resized = crop.resize((200, 200), Image.ANTIALIAS)
            resized.save(os.path.join(OUTPATH, newname))

t = time.time() - t
print("Done {} images in {:.2f}s".format(numOfImages, t))
print("({:.1f} images per second)".format(numOfImages / t))
print("({:.1f} tiles per second)".format(tilesPerImage * numOfImages / t))

INPATH = r"data"
LIST = ['1', '2', '5', '8']
OUTPATH = r"resized"
CROP = 256
NUMBER_CROPS = 5
RESIZE = 200

# generate(...) is defined elsewhere; it is assumed to return a worker callable that
# parallel applies to each image index
parallel(generate(INPATH, LIST, OUTPATH, CROP, NUMBER_CROPS, RESIZE),
         [x for x in range(len(os.listdir(INPATH)))])
def main():
    model = StegNet(10, 6)
    print("Created Model")

    if args.train:
        data_train = ImageLoader(args.datapath + '/train', args.num_train,
                                 args.fourierSeed, args.size, args.bs)
        data_val = ImageLoader(args.datapath + '/val', args.num_val,
                               args.fourierSeed, args.size, args.bs)
        data = DataBunch(data_train, data_val)
        print("Loaded DataSets")

        if args.model is not None:
            model.load_state_dict(torch.load(args.model))
            print("Loaded pretrained model")

        loss_fn = mse
        learn = Learner(data, model, loss_func=loss_fn, metrics=[mse_cov, mse_hidden])
        print("training")
        fit_one_cycle(learn, args.epochs, 1e-2)
        torch.save(learn.model.state_dict(), "model.pth")
        print("model saved")
    else:
        path = input("Enter path of the model: ") if args.model is None else args.model
        model.load_state_dict(torch.load(path))  # load from the prompted path, not args.model
        model.eval()

        if args.encode:
            f_paths = [
                args.datapath + '/cover/' + f
                for f in os.listdir(args.datapath + '/cover')
            ]
            try:
                os.mkdir(args.datapath + '/encoded')
            except OSError:
                pass
            fourier_func = partial(encrypt, seed=args.fourierSeed)
            encode_partial = partial(encode, model=model.encoder, size=args.size,
                                     fourier_func=fourier_func)
            parallel(encode_partial, f_paths)
        else:
            f_paths = [
                args.datapath + '/encoded/' + f
                for f in os.listdir(args.datapath + '/encoded')
            ]
            try:
                os.mkdir(args.datapath + '/decoded')
            except OSError:
                pass
            fourier_func = partial(decrypt, seed=args.fourierSeed)
            decode_partial = partial(decode, model=model.decoder, size=args.size,
                                     fourier_func=fourier_func)
            parallel(decode_partial, f_paths)
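# Usage sketch (the argparse definitions for args are not shown, so the script name and flag
# spellings below are illustrative; they mirror the attribute names used above):
#
#   python stegnet.py --train  --datapath data --num_train 8000 --num_val 1000 --size 128 --bs 16 --epochs 20
#   python stegnet.py --encode --datapath data --model model.pth --fourierSeed 42 --size 128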