def download_test_resources(args: Args):
    # Try running the download pipeline
    try:
        # Get test resources dir
        resources_dir = (
            Path(__file__).parent.parent / "aicsimageio" / "tests" / "resources"
        ).resolve()
        resources_dir.mkdir(exist_ok=True)

        # Get quilt package
        package = Package.browse(
            "aicsimageio/test_resources",
            "s3://aics-modeling-packages-test-resources",
            top_hash=args.top_hash,
        )

        # Download
        package["resources"].fetch(resources_dir)

        log.info("Completed package download.")

    # Catch any exception
    except Exception as e:
        log.error("=============================================")
        if args.debug:
            log.error("\n\n" + traceback.format_exc())
            log.error("=============================================")
        log.error("\n\n" + str(e) + "\n")
        log.error("=============================================")
        sys.exit(1)
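# A minimal sketch of the Args object consumed above, assuming a simple
# argparse-style namespace. The real Args class is defined elsewhere in the
# script and may differ; download_test_resources() only reads top_hash and debug.
import argparse

args = argparse.Namespace(
    top_hash=None,  # pin a specific resource version; None means latest
    debug=False,    # when True, log the full traceback on failure
)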
def exec_module(cls, module):
    """
    Module executor.
    """
    name_parts = module.__name__.split('.')
    registry = get_from_config('default_local_registry')

    if module.__name__ == 'quilt3.data':
        # __path__ must be set even if the package is virtual. Since __path__ will be
        # scanned by all other finders preceding this one in sys.meta_path order, make sure
        # it points to someplace lacking importable objects
        module.__path__ = MODULE_PATH
        return module
    elif len(name_parts) == 3:  # e.g. module.__name__ == quilt3.data.foo
        namespace = name_parts[2]

        # we do not know the name the user will ask for, so populate all valid names
        for pkg in list_packages():
            pkg_user, pkg_name = pkg.split('/')
            if pkg_user == namespace:
                module.__dict__[pkg_name] = Package.browse(pkg, registry=registry)

        module.__path__ = MODULE_PATH
        return module
    else:
        assert False
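# Illustration of what exec_module() above enables: with the quilt3.data finder
# installed on sys.meta_path, each local package "user/name" becomes an attribute
# of the virtual module quilt3.data.<user>. "foo/bar_pkg" below is a hypothetical
# package name, used only to show the import shape.
import quilt3.data.foo           # triggers exec_module() for the "foo" namespace

pkg = quilt3.data.foo.bar_pkg    # a Package instance loaded via Package.browse()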
def test_diff(self):
    new_pkg = Package()

    # Create a dummy file to add to the package.
    test_file_name = 'bar'
    with open(test_file_name, "w") as fd:
        fd.write('test_file_content_string')
        test_file = Path(fd.name)

    # Build a new package into the local registry.
    new_pkg = new_pkg.set('foo', test_file_name)
    top_hash = new_pkg.build("Quilt/Test")

    p1 = Package.browse('Quilt/Test')
    p2 = Package.browse('Quilt/Test')
    assert p1.diff(p2) == ([], [], [])
def test_manifest(self):
    pkg = Package()
    pkg.set('as/df', LOCAL_MANIFEST)
    pkg.set('as/qw', LOCAL_MANIFEST)
    top_hash = pkg.build('foo/bar').top_hash
    manifest = list(pkg.manifest)

    pkg2 = Package.browse('foo/bar', top_hash=top_hash)
    assert list(pkg.manifest) == list(pkg2.manifest)
def test_top_hash_stable(self):
    """Ensure that top_hash() never changes for a given manifest"""
    registry = DATA_DIR.as_posix()
    top_hash = '20de5433549a4db332a11d8d64b934a82bdea8f144b4aecd901e7d4134f8e733'

    pkg = Package.browse('foo/bar', registry=registry, top_hash=top_hash)

    assert pkg.top_hash == top_hash, \
        "Unexpected top_hash for {}/packages/.quilt/packages/{}".format(registry, top_hash)
def test_rollback(self):
    p = Package()
    p.set('foo', DATA_DIR / 'foo.txt')
    p.build('quilt/tmp')

    good_hash = p.top_hash

    assert 'foo' in Package.browse('quilt/tmp')

    p.delete('foo')
    p.build('quilt/tmp')

    assert 'foo' not in Package.browse('quilt/tmp')

    Package.rollback('quilt/tmp', LOCAL_REGISTRY, good_hash)

    assert 'foo' in Package.browse('quilt/tmp')

    with self.assertRaises(QuiltException):
        Package.rollback('quilt/tmp', LOCAL_REGISTRY, '12345678' * 8)

    with self.assertRaises(QuiltException):
        Package.rollback('quilt/blah', LOCAL_REGISTRY, good_hash)
def test_browse_package_from_registry(self):
    """ Verify loading manifest locally and from s3 """
    with patch('quilt3.Package._from_path') as pkgmock:
        registry = LOCAL_REGISTRY.resolve().as_uri()
        pkg = Package()
        pkgmock.return_value = pkg
        top_hash = pkg.top_hash

        pkg = Package.browse('Quilt/nice-name', top_hash=top_hash)
        assert '{}/.quilt/packages/{}'.format(registry, top_hash) \
            in [x[0][0] for x in pkgmock.call_args_list]

        pkgmock.reset_mock()

        with patch('quilt3.packages.get_bytes') as dl_mock:
            dl_mock.return_value = (top_hash.encode('utf-8'), None)
            pkg = Package.browse('Quilt/nice-name')
            assert registry + '/.quilt/named_packages/Quilt/nice-name/latest' \
                == dl_mock.call_args_list[0][0][0]

        assert '{}/.quilt/packages/{}'.format(registry, top_hash) \
            in [x[0][0] for x in pkgmock.call_args_list]

        pkgmock.reset_mock()
        remote_registry = 's3://asdf/foo'

        # remote load
        pkg = Package.browse('Quilt/nice-name', registry=remote_registry,
                             top_hash=top_hash)
        assert '{}/.quilt/packages/{}'.format(remote_registry, top_hash) \
            in [x[0][0] for x in pkgmock.call_args_list]

        pkgmock.reset_mock()
        pkg = Package.browse('Quilt/nice-name', top_hash=top_hash,
                             registry=remote_registry)
        assert '{}/.quilt/packages/{}'.format(remote_registry, top_hash) \
            in [x[0][0] for x in pkgmock.call_args_list]

        pkgmock.reset_mock()
        with patch('quilt3.packages.get_bytes') as dl_mock:
            dl_mock.return_value = (top_hash.encode('utf-8'), None)
            pkg = Package.browse('Quilt/nice-name', registry=remote_registry)
        assert '{}/.quilt/packages/{}'.format(remote_registry, top_hash) \
            in [x[0][0] for x in pkgmock.call_args_list]

    # registry failure case
    with patch('quilt3.packages.get_from_config',
               return_value=fix_url(os.path.dirname(__file__))):
        with pytest.raises(FileNotFoundError):
            Package.browse('Quilt/nice-name')
def download_test_resources() -> None:
    root = Path(__file__).parent.parent.parent
    resources = (root / "aicsimageio" / "aicsimageio" / "tests" / "resources").resolve()

    # Get the specific hash for test resources
    with open(root / "aicsimageio" / "scripts" / "TEST_RESOURCES_HASH.txt", "r") as f:
        top_hash = f.readline().strip()

    # Download test resources
    resources.mkdir(exist_ok=True)
    package = Package.browse(
        "aicsimageio/test_resources",
        "s3://aics-modeling-packages-test-resources",
        top_hash=top_hash,
    )
    package["resources"].fetch(resources)
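# A minimal usage sketch, assuming this function is the script's entry point;
# the original file may wire this up differently.
if __name__ == "__main__":
    download_test_resources()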
def chart_benchmarks(args: Args):
    # Check save dir exists or create
    args.save_dir.mkdir(parents=True, exist_ok=True)

    # Get file
    if args.benchmark_file is None:
        benchmark_filepath = Path("benchmark_results.json")
        p = Package.browse(
            "aicsimageio/benchmarks",
            "s3://aics-modeling-packages-test-resources",
        )
        p["results.json"].fetch(benchmark_filepath)
    else:
        benchmark_filepath = args.benchmark_file

    # Read results file
    with open(benchmark_filepath, "r") as read_in:
        all_results = json.load(read_in)

    # Generate charts for each config
    per_cluster_results = []
    selected_cluster_results = []
    for config_name, results in all_results.items():
        results = pd.DataFrame(results)
        results["config"] = config_name

        # Add to all
        per_cluster_results.append(results)

        # Add to primary viz
        if config_name in SELECTED_CLUSTERS_TO_VISUALIZE:
            selected_cluster_results.append(results)

        chart = _generate_chart(results)
        chart.save(str(args.save_dir / f"{config_name}.png"))

    # Generate unified chart
    all_results = pd.concat(per_cluster_results)
    unified_chart = _generate_chart(all_results)
    unified_chart.save(str(args.save_dir / "all.png"))

    # Generate unified primary chart
    primary_results = pd.concat(selected_cluster_results)
    unified_chart = _generate_chart(primary_results, sorted=True)
    unified_chart.save(str(args.save_dir / "primary.png"))
import os

import pandas as pd
from tqdm import tqdm
from quilt3distribute import Dataset
from quilt3 import Package

# Download the datasets from Quilt if there is no local copy
ds_folder = "../database/"
if not os.path.exists(os.path.join(ds_folder, "metadata.csv")):
    pkg = Package.browse(
        "matheus/assay_dev_datasets", "s3://allencell-internal-quilt"
    ).fetch(ds_folder)

metadata = pd.read_csv(os.path.join(ds_folder, "metadata.csv"))
df_meta = pd.read_csv(os.path.join(ds_folder, metadata.database_path[0]), index_col=0)

# FOVs that could not be read from the server.
# We shall come back to these files in the future.
fovs_with_read_problems = [40, 135, 462, 2000]

# Gathering results
df = []
for FOVId in tqdm(df_meta.index):
    if FOVId not in fovs_with_read_problems:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Download the model weights from Quilt if the folder `best_model` is empty
model_weights_path = []
if os.path.exists("../best_model"):
    for f in os.listdir("../best_model"):
        # Search for pth files
        if f.endswith(".pth"):
            model_weights_path.append(os.path.join("..", "best_model", f))

if not model_weights_path:
    # Download from Quilt
    print("No weights were found locally. Downloading from Quilt...")
    pkg = Package.browse(
        "matheus/assay_dev_actn2_classifier", "s3://allencell-internal-quilt"
    ).fetch("../best_model")
    metadata = pd.read_csv(os.path.join("..", "best_model", "metadata.csv"))
    model_weights_path = os.path.join("..", "best_model", metadata.model_path[0])
elif len(model_weights_path) > 1:
    # Use the last one in case more than one is found
    model_weights_path = model_weights_path[-1]
    print(f"More than one weight file found. Using the last one: {model_weights_path}.")
else:
    # Only one file found
    model_weights_path = model_weights_path[0]

# Load weights
classifier = cardio_cnn(model_path=model_weights_path)

# Segment the images for background calculation
def __init__(self, num_batches, BATCH_SIZE, model_kwargs,
             shuffle=True, corr=False, train=True, mask=False):
    """
    Args:
        num_batches: Number of batches of synthetic data
        BATCH_SIZE: batch size of synthetic data
        model_kwargs: dictionary containing "x_dim", which indicates input data size
        shuffle: True sets condition vector in input data to 0 for all possible permutations
        corr: True sets dependent input dimensions via a correlation matrix
    """
    self.num_batches = num_batches
    self.BATCH_SIZE = BATCH_SIZE
    self.corr = corr
    self.shuffle = shuffle
    self.model_kwargs = model_kwargs
    self.train = train

    Batches_C_train, Batches_C_test = torch.empty([0]), torch.empty([0])
    Batches_X_train, Batches_X_test = torch.empty([0]), torch.empty([0])
    Batches_conds_train, Batches_conds_test = torch.empty([0]), torch.empty([0])

    ds = Package.browse("aics/pipeline_integrated_single_cell", "s3://allencell")

    # Path to pre-downloaded quilt json files
    try:
        path_to_json = model_kwargs['json_quilt_path']
    except KeyError:
        path_to_json = "/home/ritvik.vasan/test/"

    meta_to_file_name = []
    for f in ds["cell_features"]:
        meta_to_file_name.append({"filename": f, **ds["cell_features"][f].meta})
    metas = pd.DataFrame(meta_to_file_name)

    # Path to config file for FeatureDatabase
    try:
        db = FeatureDatabase(model_kwargs['config_path'])
    except KeyError:
        db = FeatureDatabase("/home/ritvik.vasan/config.json")

    t = db.get_pg_table("featuresets", "aics-mitosis-classifier-four-stage_v1.0.0")

    semi = metas.merge(t, left_on="CellId", right_on="CellId",
                       suffixes=("_meta", "_mito"))

    # Binarize: interphase (0) vs. any mitotic stage (1)
    semi['Interphase and Mitotic Stages [stage]'] = semi[
        'Interphase and Mitotic Stages [stage]'].apply(lambda x: 0 if x == 0.0 else 1)

    dd = defaultdict(list)
    for i in range(len(semi['filename'])):
        this_file = semi['filename'][i]
        a = json.loads(open(path_to_json + this_file).read())
        a = {key: value for key, value in a.items()
             if key not in ['imsize_orig', 'com', 'angle', 'flipdim',
                            'imsize_registered']}
        a.update({'CellId': semi['CellId'][i]})
        for key, value in a.items():
            dd[key].append(value)

    features_plus_cellid = pd.DataFrame(dict(dd))
    meta_plus_features = pd.merge(semi, features_plus_cellid, on='CellId')

    i_care_cols = [
        c for c in meta_plus_features.columns
        if c not in [
            'CellId', 'CellIndex', 'FOVId', 'WellId', 'FeatureExplorerURL',
            'CellLine', 'Workflow', 'associates', 'filename',
            'NucMembSegmentationAlgorithm', 'NucMembSegmentationAlgorithmVersion',
            'PlateId'
        ]
    ]
    meta_plus_features = meta_plus_features[i_care_cols]
    meta_plus_features.dropna(inplace=True)

    categorical_features = ['Gene', 'ProteinDisplayName', 'StructureDisplayName']
    categorical_dataframe = meta_plus_features[categorical_features]
    non_categorical_dataframe = meta_plus_features[
        [c for c in meta_plus_features.columns if c not in categorical_features]
    ]
    one_hot_categorical_features = pd.get_dummies(
        categorical_dataframe, prefix=None, drop_first=True)

    # This is mean, std normalization
    non_categorical_dataframe = non_categorical_dataframe.iloc[:, :]

    self._feature_names = [i for i in non_categorical_dataframe.columns] + \
        [i for i in one_hot_categorical_features.columns]

    num_training_samples = 33000
    x = non_categorical_dataframe.values
    std_scaler = preprocessing.StandardScaler()

    # Column 0 is binary; don't scale that column
    x_train_and_test_scaled = std_scaler.fit_transform(
        x[:, 1:model_kwargs["x_dim"] + 1])
    x_train_scaled = std_scaler.fit_transform(
        x[:num_training_samples, 1:model_kwargs["x_dim"] + 1])
    x_test_scaled = std_scaler.transform(
        x[num_training_samples:, 1:model_kwargs["x_dim"] + 1])

    if model_kwargs["x_dim"] > 103:
        non_categorical_train = pd.DataFrame(
            np.concatenate((x[:num_training_samples, 0:1], x_train_scaled), axis=1))
        non_categorical_test = pd.DataFrame(
            np.concatenate((x[num_training_samples:, 0:1], x_test_scaled), axis=1))
        non_categorical_train_and_test = pd.DataFrame(
            np.concatenate((x[:, 0:1], x_train_and_test_scaled), axis=1))
        non_categorical_train_and_test.columns = self._feature_names[:103]
    else:
        non_categorical_train = pd.DataFrame(x_train_scaled)
        non_categorical_test = pd.DataFrame(x_test_scaled)
        non_categorical_train_and_test = pd.DataFrame(x_train_and_test_scaled)
        self._feature_names = self._feature_names[1:model_kwargs['x_dim'] + 1]
        non_categorical_train_and_test.columns = self._feature_names[:]

    # Convert to torch tensors
    self._non_categorical_dataframe = non_categorical_train_and_test
    self._categorical_dataframe = one_hot_categorical_features

    X_train_whole_batch = torch.from_numpy(non_categorical_train.values).float()
    X_test_whole_batch = torch.from_numpy(non_categorical_test.values).float()
    all_categorical_X = torch.from_numpy(one_hot_categorical_features.values).float()

    if model_kwargs["x_dim"] > 103:
        X_train_whole_batch = torch.cat(
            (X_train_whole_batch, all_categorical_X[:num_training_samples, :]), 1)
        X_test_whole_batch = torch.cat(
            (X_test_whole_batch, all_categorical_X[num_training_samples:, :]), 1)

    for i in range(self.num_batches):
        X_train = X_train_whole_batch[i * self.BATCH_SIZE:(i + 1) * self.BATCH_SIZE, :]
        X_test = X_test_whole_batch[i * self.BATCH_SIZE:(i + 1) * self.BATCH_SIZE, :]

        if X_train.size()[0] != self.BATCH_SIZE:
            break

        self._color = X_train[:, 0]

        C_train = X_train.clone()
        C_test = X_test.clone()

        if self.shuffle is True:
            C_mask_train = torch.zeros(C_train.shape).bernoulli_(0.5)
            C_mask_test = torch.zeros(C_test.shape).bernoulli_(0.5)
        else:
            C_mask_train = torch.zeros(C_train.shape).bernoulli_(0)
            C_mask_test = torch.zeros(C_test.shape).bernoulli_(0)

        C_train[C_mask_train.byte()] = 0
        C_train_indicator = C_mask_train == 0
        C_test[C_mask_test.byte()] = 0
        C_test_indicator = C_mask_test == 0

        C_train = torch.cat([C_train.float(), C_train_indicator.float()], 1)
        C_test = torch.cat([C_test.float(), C_test_indicator.float()], 1)

        X_train = X_train.view([1, -1, X_train.size()[-1]])
        X_test = X_test.view([1, -1, X_test.size()[-1]])
        C_train = C_train.view([1, -1, X_train.size()[-1] * 2])
        C_test = C_test.view([1, -1, X_test.size()[-1] * 2])

        # Sum up
        conds_train = C_train[:, :, X_train.size()[-1]:].sum(2)
        conds_test = C_test[:, :, X_test.size()[-1]:].sum(2)

        Batches_X_train = torch.cat([Batches_X_train, X_train], 0)
        Batches_C_train = torch.cat([Batches_C_train, C_train], 0)
        Batches_conds_train = torch.cat([Batches_conds_train, conds_train], 0)

        try:
            Batches_X_test = torch.cat([Batches_X_test, X_test], 0)
            Batches_C_test = torch.cat([Batches_C_test, C_test], 0)
            Batches_conds_test = torch.cat([Batches_conds_test, conds_test], 0)
        except RuntimeError:
            pass

    self._batches_x_train = Batches_X_train
    self._batches_c_train = Batches_C_train
    self._batches_conds_train = Batches_conds_train
    self._batches_x_test = Batches_X_test
    self._batches_c_test = Batches_C_test
    self._batches_conds_test = Batches_conds_test
import os

import torch.utils.data as data
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from functions import load_data, dataset_training, cardio_cnn_resnet_18, train, validation
from quilt3 import Package
import pandas as pd

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Load data from quilt
p = Package.browse(
    "matheus/assay_dev_classifier_train", "s3://allencell-internal-quilt"
).fetch("data/")
manifest = pd.read_csv("data/metadata.csv", index_col=0)

# Model save path
save_model_path = "./models/"  # save Pytorch models

# Set model parameters
data_path = f"data/{manifest.DataPath[0]}"
label_lists = (
    f"data/{manifest.AnnotationDiffusePath[0]}",
    f"data/{manifest.AnnotationFibersPath[0]}",
    f"data/{manifest.AnnotationDisorganizedPunctaPath[0]}",
    f"data/{manifest.AnnotationOrganizedPunctaPath[0]}",
    f"data/{manifest.AnnotationOrganizedZDisks[0]}",
def test_remote_browse(self):
    """ Verify loading manifest from s3 """
    registry = 's3://test-bucket'
    top_hash = 'abcdefgh' * 8

    # Make the first request.
    self.s3_stubber.add_response(
        method='get_object',
        service_response={
            'VersionId': 'v1',
            'Body': BytesIO(top_hash.encode()),
        },
        expected_params={
            'Bucket': 'test-bucket',
            'Key': '.quilt/named_packages/Quilt/test/latest',
        }
    )

    self.s3_stubber.add_response(
        method='head_object',
        service_response={
            'VersionId': 'v1',
            'ContentLength': REMOTE_MANIFEST.stat().st_size,
        },
        expected_params={
            'Bucket': 'test-bucket',
            'Key': f'.quilt/packages/{top_hash}',
        }
    )

    self.s3_stubber.add_response(
        method='get_object',
        service_response={
            'VersionId': 'v1',
            'Body': BytesIO(REMOTE_MANIFEST.read_bytes()),
            'ContentLength': REMOTE_MANIFEST.stat().st_size,
        },
        expected_params={
            'Bucket': 'test-bucket',
            'Key': f'.quilt/packages/{top_hash}',
        }
    )

    pkg = Package.browse('Quilt/test', registry=registry)
    assert 'foo' in pkg

    # Make the second request. Gets "latest", but the rest should be cached.
    self.s3_stubber.add_response(
        method='get_object',
        service_response={
            'VersionId': 'v1',
            'Body': BytesIO(top_hash.encode()),
        },
        expected_params={
            'Bucket': 'test-bucket',
            'Key': '.quilt/named_packages/Quilt/test/latest',
        }
    )

    pkg2 = Package.browse('Quilt/test', registry=registry)
    assert 'foo' in pkg2

    # Make another request with a top hash. Everything should be cached.
    pkg3 = Package.browse('Quilt/test', top_hash=top_hash, registry=registry)
    assert 'foo' in pkg3

    # Make a request with a short hash.
    self.s3_stubber.add_response(
        method='list_objects_v2',
        service_response={
            'Contents': [
                {
                    'Key': f'.quilt/packages/{top_hash}',
                    'Size': 64,
                },
                {
                    'Key': f'.quilt/packages/{"a" * 64}',
                    'Size': 64,
                }
            ]
        },
        expected_params={
            'Bucket': 'test-bucket',
            'Prefix': '.quilt/packages/',
        }
    )

    pkg3 = Package.browse('Quilt/test', top_hash='abcdef', registry=registry)
    assert 'foo' in pkg3

    # Make a request with a bad short hash.
    with self.assertRaises(QuiltException):
        Package.browse('Quilt/test', top_hash='abcde', registry=registry)
    with self.assertRaises(QuiltException):
        Package.browse('Quilt/test', top_hash='a' * 65, registry=registry)

    # Make a request with a non-existent short hash.
    self.s3_stubber.add_response(
        method='list_objects_v2',
        service_response={
            'Contents': [
                {
                    'Key': f'.quilt/packages/{top_hash}',
                    'Size': 64,
                },
                {
                    'Key': f'.quilt/packages/{"a" * 64}',
                    'Size': 64,
                }
            ]
        },
        expected_params={
            'Bucket': 'test-bucket',
            'Prefix': '.quilt/packages/',
        }
    )

    with self.assertRaises(QuiltException):
        Package.browse('Quilt/test', top_hash='123456', registry=registry)
def test_install(self):
    # Manifest
    self.s3_stubber.add_response(
        method='get_object',
        service_response={
            'VersionId': 'v1',
            'Body': BytesIO(b'abcdef'),
        },
        expected_params={
            'Bucket': 'my-test-bucket',
            'Key': '.quilt/named_packages/Quilt/Foo/latest',
        }
    )

    self.s3_stubber.add_response(
        method='head_object',
        service_response={
            'VersionId': 'v1',
            'ContentLength': REMOTE_MANIFEST.stat().st_size,
        },
        expected_params={
            'Bucket': 'my-test-bucket',
            'Key': '.quilt/packages/abcdef',
        }
    )

    self.s3_stubber.add_response(
        method='get_object',
        service_response={
            'VersionId': 'v1',
            'Body': BytesIO(REMOTE_MANIFEST.read_bytes()),
            'ContentLength': REMOTE_MANIFEST.stat().st_size,
        },
        expected_params={
            'Bucket': 'my-test-bucket',
            'Key': '.quilt/packages/abcdef',
        }
    )

    # Objects
    self.s3_stubber.add_response(
        method='get_object',
        service_response={
            'VersionId': 'v1',
            'Body': BytesIO(b'a,b,c'),
        },
        expected_params={
            'Bucket': 'my_bucket',
            'Key': 'my_data_pkg/bar.csv',
        }
    )

    self.s3_stubber.add_response(
        method='get_object',
        service_response={
            'VersionId': 'v1',
            'Body': BytesIO(b'Hello World!'),
        },
        expected_params={
            'Bucket': 'my_bucket',
            'Key': 'my_data_pkg/baz/bat',
        }
    )

    self.s3_stubber.add_response(
        method='get_object',
        service_response={
            'VersionId': 'v1',
            'Body': BytesIO('💩'.encode()),
        },
        expected_params={
            'Bucket': 'my_bucket',
            'Key': 'my_data_pkg/foo',
        }
    )

    with patch('quilt3.data_transfer.s3_transfer_config.max_request_concurrency', 1):
        Package.install('Quilt/Foo', registry='s3://my-test-bucket', dest='package')

    p = Package.browse('Quilt/Foo')

    assert p['foo'].get() == 's3://my_bucket/my_data_pkg/foo'

    # Check that the cache works.
    local_path = pathlib.Path(p['foo'].get_cached_path())
    assert local_path == pathlib.Path.cwd() / 'package/foo'
    assert local_path.read_text('utf8') == '💩'

    # Test that get_bytes and get_as_text work.
    assert p['foo'].get_bytes().decode("utf-8") == '💩'
    assert p['foo'].get_as_string() == '💩'

    # Check that moving the file invalidates the cache...
    local_path.rename('foo2')
    assert p['foo'].get_cached_path() is None

    # ...but moving it back fixes it.
    pathlib.Path('foo2').rename(local_path)
    assert p['foo'].get_cached_path() == str(local_path)

    # Check that changing the contents invalidates the cache.
    local_path.write_text('omg')
    assert p['foo'].get_cached_path() is None

    # Check that installing the package again reuses the cached manifest
    # and two objects, but not "foo".
    self.s3_stubber.add_response(
        method='get_object',
        service_response={
            'VersionId': 'v1',
            'Body': BytesIO(b'abcdef'),
        },
        expected_params={
            'Bucket': 'my-test-bucket',
            'Key': '.quilt/named_packages/Quilt/Foo/latest',
        }
    )

    self.s3_stubber.add_response(
        method='get_object',
        service_response={
            'VersionId': 'v1',
            'Body': BytesIO('💩'.encode()),
        },
        expected_params={
            'Bucket': 'my_bucket',
            'Key': 'my_data_pkg/foo',
        }
    )

    with patch('quilt3.data_transfer.s3_transfer_config.max_request_concurrency', 1):
        Package.install('Quilt/Foo', registry='s3://my-test-bucket', dest='package/')