def starfile2df(cls, filepath, data_folder=None, max_rows=None):
    """
    Load the first block of a Relion STAR file into a pandas DataFrame.

    :param filepath: Path to the ``.star`` file to read.
    :param data_folder: Folder containing the image data files referenced by
        the STAR file. Relative paths are resolved against the STAR file's
        directory; if None, the STAR file's directory itself is used.
    :param max_rows: If given, truncate the result to the first `max_rows` rows.
    :return: DataFrame of the first block, with typed columns and the helper
        columns ``__mrc_index``, ``__mrc_filename`` and ``__mrc_filepath`` added.
    """
    if data_folder is not None:
        if not os.path.isabs(data_folder):
            data_folder = os.path.join(os.path.dirname(filepath), data_folder)
    else:
        data_folder = os.path.dirname(filepath)

    # Note: Valid Relion image "_data.star" files have to have their data in
    # the first loop of the first block.
    # We are getting the first (and only) block in this StarFile object.
    df = StarFile(filepath).get_block_by_index(0)

    # Cast each column to its declared metadata type, defaulting to str.
    column_types = {name: cls.metadata_fields.get(name, str) for name in df.columns}
    df = df.astype(column_types)

    # "_rlnImageName" entries look like "000001@stack.mrcs"; split once on "@".
    # Fix: `n` must be passed by keyword — positional `n` for Series.str.split
    # was deprecated in pandas 1.4 and removed in pandas 2.0.
    df[["__mrc_index", "__mrc_filename"]] = df["_rlnImageName"].str.split(
        "@", n=1, expand=True
    )
    df["__mrc_index"] = pd.to_numeric(df["__mrc_index"])

    # Adding a full-filepath field to the DataFrame helps us save time later.
    # Note that os.path.join works as expected when the second argument is
    # an absolute path itself.
    df["__mrc_filepath"] = df["__mrc_filename"].apply(
        lambda filename: os.path.join(data_folder, filename)
    )

    if max_rows is None:
        return df
    return df.iloc[:max_rows]
def testArgsError(self):
    """Passing both a filepath and pre-built blocks must raise StarFileError."""
    with self.assertRaises(StarFileError):
        conflicting_blocks = OrderedDict()
        conflicting_blocks[""] = DataFrame(["test", "data"])
        with importlib_resources.path(
            tests.saved_test_data, "sample_data_model.star"
        ) as path:
            StarFile(filepath=path, blocks=conflicting_blocks)
def testSave(self):
    """Round-trip via save(): write to disk, re-read, and compare.

    StarFile/StarFileBlock support __eq__, so object equality suffices.
    """
    out_name = "sample_saved.star"
    with open(out_name, "w") as fh:
        self.starfile.save(fh)
    self.starfile2 = StarFile(out_name)
    self.assertEqual(self.starfile, self.starfile2)
    os.remove(out_name)
def testReadWriteReadBack(self):
    """Round-trip via write(): persist, re-read, and compare for equality.

    StarFile.__eq__ compares the underlying OrderedDicts of DataFrames
    using pd.DataFrame.equals().
    """
    out_path = os.path.join(self.tmpdir, "sample_saved.star")
    self.starfile.write(out_path)
    round_tripped = StarFile(out_path)
    self.assertEqual(self.starfile, round_tripped)
    os.remove(out_path)
def setUp(self):
    """Build the shared StarFile, Image, and tmpdir fixtures."""
    with importlib_resources.path(
        tests.saved_test_data, "sample_data_model.star"
    ) as path:
        self.starfile = StarFile(path)

    # Independent Image object for testing Image source methods
    side = 768
    self.im = Image(misc.face(gray=True).astype("float64")[:side, :side])
    self.img_src = ArrayImageSource(self.im)

    # Also flex the stack logic: broadcast one image into a stack, then
    # scale each slice by its index so every image is methodically different.
    self.n = 21
    stack = np.broadcast_to(self.im.data, (self.n, side, side))
    stack = np.multiply(stack, np.arange(self.n)[:, None, None])
    self.im_stack = Image(stack)
    self.img_src_stack = ArrayImageSource(self.im_stack)

    # Per-test temporary directory; its path is exposed via self.tmpdir.
    self._tmpdir = tempfile.TemporaryDirectory()
    self.tmpdir = self._tmpdir.name
def write_star(self, df1, df2, ang, cs, voltage, pixel_size, amp, name, output_dir):
    """
    Writes CTF parameters to starfile.

    :param df1: Value written to `_rlnDefocusU`.
    :param df2: Value written to `_rlnDefocusV`.
    :param ang: Value written to `_rlnDefocusAngle`.
    :param cs: Value written to `_rlnSphericalAbberation`.
    :param voltage: Value written to `_rlnVoltage`.
    :param pixel_size: Value written to `_rlnDetectorPixelSize`.
    :param amp: Value written to `_rlnAmplitudeContrast`.
    :param name: Micrograph name; its stem also names the output `.star` file.
    :param output_dir: Directory to write into; created if missing.
    """
    # Create the output directory on first use.
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    data_block = {}
    data_block["_rlnMicrographName"] = name
    data_block["_rlnDefocusU"] = df1
    data_block["_rlnDefocusV"] = df2
    data_block["_rlnDefocusAngle"] = ang
    # NOTE(review): the standard Relion label is "_rlnSphericalAberration";
    # this key spells it "Abberation". Confirm downstream readers expect this
    # exact spelling before changing it.
    data_block["_rlnSphericalAbberation"] = cs
    data_block["_rlnAmplitudeContrast"] = amp
    data_block["_rlnVoltage"] = voltage
    data_block["_rlnDetectorPixelSize"] = pixel_size
    # Single-row loop in a single block named "root".
    df = DataFrame([data_block])
    blocks = OrderedDict()
    blocks["root"] = df
    star = StarFile(blocks=blocks)
    # Output filename is "<name without extension>.star" inside output_dir.
    star.write(os.path.join(output_dir, os.path.splitext(name)[0]) + ".star")
def testWriteReadWriteBack(self):
    """Write a hand-built StarFile, read it back, write again, and compare
    both the objects and the two files line by line."""
    out1 = os.path.join(self.tmpdir, "sample_saved.star")
    out2 = os.path.join(self.tmpdir, "sampled_saved2.star")

    # Build a StarFile directly from an OrderedDict of blocks rather than
    # reading a file. GEMMI requires field names to start with "_".
    data = OrderedDict()

    # A key-value set (a set of pairs in GEMMI parlance).
    data["pair"] = {f"_key{i}": f"val{i}" for i in range(1, 5)}

    # A single-row loop — deliberately distinct from a set of pairs.
    single = {"_field1": 31, "_field2": 32, "_field3": 33}
    data["single_row"] = DataFrame([single], columns=single.keys())

    # A multi-row loop built from a list of lists.
    loop_cols = ["_field4", "_field5", "_field6"]
    loop_rows = [[f"{x}{y}" for x in range(3)] for y in range(3)]
    data["loops"] = DataFrame(loop_rows, columns=loop_cols)

    # Construct via the blocks kwarg, write, read back, and compare objects.
    original = StarFile(blocks=data)
    original.write(out1)
    read_back = StarFile(out1)
    self.assertEqual(original, read_back)

    # Write the read-back object and compare the two files textually.
    read_back.write(out2)
    with open(out1) as fa, open(out2) as fb:
        self.assertEqual(fa.readlines(), fb.readlines())

    os.remove(out1)
    os.remove(out2)
class StarFileTestCase(TestCase):
    """Tests for the dict/DataFrame-backed StarFile class.

    Pair blocks are represented as dicts and loop blocks as pandas
    DataFrames; blocks are addressable by index or by name.
    """

    def setUp(self):
        # Read the shared sample STAR file fixture.
        with importlib_resources.path(
            tests.saved_test_data, "sample_data_model.star"
        ) as path:
            self.starfile = StarFile(path)

        # Independent Image object for testing Image source methods
        L = 768
        self.im = Image(misc.face(gray=True).astype("float64")[:L, :L])
        self.img_src = ArrayImageSource(self.im)

        # We also want to flex the stack logic.
        self.n = 21
        im_stack = np.broadcast_to(self.im.data, (self.n, L, L))
        # make each image methodically different
        im_stack = np.multiply(im_stack, np.arange(self.n)[:, None, None])
        self.im_stack = Image(im_stack)
        self.img_src_stack = ArrayImageSource(self.im_stack)

        # Create a tmpdir object for this test instance
        self._tmpdir = tempfile.TemporaryDirectory()
        # Get the directory from the name attribute of the instance
        self.tmpdir = self._tmpdir.name

    def tearDown(self):
        # Destroy the tmpdir instance and contents
        self._tmpdir.cleanup()

    def testLength(self):
        # StarFile is an iterable that gives us blocks
        # blocks are pandas DataFrames
        # We have 6 blocks in our sample starfile.
        self.assertEqual(6, len(self.starfile))

    def testIteration(self):
        # A StarFile can be iterated over, yielding DataFrames for loops
        # or dictionaries for pairs
        for _, block in self.starfile:
            self.assertTrue(isinstance(block, DataFrame) or isinstance(block, dict))

    def testBlockByIndex(self):
        # We can use get_block_by_index to retrieve the blocks in
        # the OrderedDict by index
        # our first block is a set of pairs, represented by a dict
        block0 = self.starfile.get_block_by_index(0)
        self.assertTrue(isinstance(block0, dict))
        self.assertEqual(block0["_rlnReferenceDimensionality"], "3")
        # our second block is a loop, represented by a DataFrame
        block1 = self.starfile.get_block_by_index(1)
        self.assertTrue(isinstance(block1, DataFrame))
        self.assertEqual(block1.at[0, "_rlnClassDistribution"], "1.000000")

    def testBlockByName(self):
        # Indexing a StarFile with a string gives us a block with that name
        # ("data_<name>" in starfile).
        # the block at index 0 has the name 'model_general'
        block0 = self.starfile["model_general"]
        # this block is a pair/dict with 22 key value pairs
        self.assertEqual(len(block0), 22)
        # the block at index 1 has name 'model_classes'
        block1 = self.starfile["model_classes"]
        # This block is a loop/DF with one row
        self.assertEqual(len(block1), 1)

    def testData(self):
        df = self.starfile["model_class_1"]
        self.assertEqual(76, len(df))
        self.assertEqual(8, len(df.columns))
        # Note that no typecasting of values is performed at io.StarFile level
        self.assertEqual(
            "0.000000",
            df[df["_rlnSpectralIndex"] == "0"].iloc[0]["_rlnResolution"],
        )

    def testFileNotFound(self):
        # A nonexistent path raises the builtin FileNotFoundError.
        with self.assertRaises(FileNotFoundError):
            StarFile("badfile.star")

    def testReadWriteReadBack(self):
        # Save the StarFile object to a .star file
        # Read it back for object equality
        # Note that __eq__ is supported for the class
        # it checks the equality of the underlying OrderedDicts of DataFrames
        # using pd.DataFrame.equals()
        test_outfile = os.path.join(self.tmpdir, "sample_saved.star")
        self.starfile.write(test_outfile)
        starfile2 = StarFile(test_outfile)
        self.assertEqual(self.starfile, starfile2)
        os.remove(test_outfile)

    def testWriteReadWriteBack(self):
        # setup our temp filenames
        test_outfile = os.path.join(self.tmpdir, "sample_saved.star")
        test_outfile2 = os.path.join(self.tmpdir, "sampled_saved2.star")
        # create a new StarFile object directly via an OrderedDict of DataFrames
        # not by reading a file
        data = OrderedDict()
        # note that GEMMI requires the names of the fields to start with _
        # initialize a key-value set (a set of pairs in GEMMI parlance)
        block0 = {"_key1": "val1", "_key2": "val2", "_key3": "val3", "_key4": "val4"}
        # initialize a single-row loop. we want this to be distinct from a
        # set of key-value pairs
        block1_dict = {"_field1": 31, "_field2": 32, "_field3": 33}
        block1 = DataFrame([block1_dict], columns=block1_dict.keys())
        block2_keys = ["_field4", "_field5", "_field6"]
        block2_arr = [[f"{x}{y}" for x in range(3)] for y in range(3)]
        # initialize a loop data block with a list of lists
        block2 = DataFrame(block2_arr, columns=block2_keys)
        data["pair"] = block0
        data["single_row"] = block1
        data["loops"] = block2
        # initialize with blocks kwarg
        original = StarFile(blocks=data)
        original.write(test_outfile)
        read_back = StarFile(test_outfile)
        # assert that the read-back objects are equal
        self.assertEqual(original, read_back)
        # write back the second star file object
        read_back.write(test_outfile2)
        # compare the two .star files line by line
        with open(test_outfile) as f_original, open(test_outfile2) as f_read_back:
            lines_original = f_original.readlines()
            lines_read_back = f_read_back.readlines()
            self.assertEqual(lines_original, lines_read_back)
        os.remove(test_outfile)
        os.remove(test_outfile2)

    def testArgsError(self):
        # Supplying both filepath and blocks is ambiguous: StarFileError.
        with self.assertRaises(StarFileError):
            _blocks = OrderedDict()
            _blocks[""] = DataFrame(["test", "data"])
            with importlib_resources.path(
                tests.saved_test_data, "sample_data_model.star"
            ) as path:
                StarFile(filepath=path, blocks=_blocks)

    def testEmptyInit(self):
        # A StarFile constructed with no arguments holds an empty OrderedDict.
        empty = StarFile()
        self.assertTrue(isinstance(empty.blocks, OrderedDict))
        self.assertEqual(len(empty.blocks), 0)
def testEmptyInit(self):
    """A StarFile constructed with no arguments holds an empty OrderedDict."""
    empty = StarFile()
    self.assertIsInstance(empty.blocks, OrderedDict)
    self.assertEqual(0, len(empty.blocks))
def testFileNotFound(self):
    """Opening a nonexistent path must raise the builtin FileNotFoundError."""
    missing_path = "badfile.star"
    with self.assertRaises(FileNotFoundError):
        StarFile(missing_path)
class StarFileTestCase(TestCase):
    """Tests for the legacy StarFile API in which iteration yields
    StarFileBlock objects and each block holds zero or more 'loop'
    DataFrames plus key=>value attributes."""

    def setUp(self):
        # Read the small sample STAR file fixture.
        with importlib_resources.path(tests.saved_test_data, "sample.star") as path:
            self.starfile = StarFile(path)

        # Independent Image object for testing Image source methods
        L = 768
        self.im = Image(misc.face(gray=True).astype("float64")[:L, :L])
        self.img_src = ArrayImageSource(self.im)

        # We also want to flex the stack logic.
        self.n = 21
        im_stack = np.broadcast_to(self.im.data, (self.n, L, L))
        # make each image methodically different
        im_stack = np.multiply(im_stack, np.arange(self.n)[:, None, None])
        self.im_stack = Image(im_stack)
        self.img_src_stack = ArrayImageSource(self.im_stack)

        # Create a tmpdir object for this test instance
        self._tmpdir = tempfile.TemporaryDirectory()
        # Get the directory from the name attribute of the instance
        self.tmpdir = self._tmpdir.name

    def tearDown(self):
        # Destroy the tmpdir instance and contents
        self._tmpdir.cleanup()

    def testLength(self):
        # StarFile is an iterable that gives us blocks.
        # We have 2 blocks in our sample starfile.
        self.assertEqual(2, len(self.starfile))

    def testIteration(self):
        # A StarFile can be iterated over, yielding StarFileBlocks
        for block in self.starfile:
            self.assertTrue(isinstance(block, StarFileBlock))

    def testBlockByIndex(self):
        # Indexing a StarFile with a 0-based index gives us a 'block',
        block0 = self.starfile[0]
        self.assertTrue(isinstance(block0, StarFileBlock))
        # Our first block has no 'loop's.
        self.assertEqual(0, len(block0))

    def testBlockByName(self):
        # Indexing a StarFile with a string gives us a block with that name
        # ("data_<name>" in starfile).
        # In our case the block at index 1 has name 'planetary'
        block1 = self.starfile["planetary"]
        # This block has a two 'loops'.
        self.assertEqual(2, len(block1))

    def testBlockProperties(self):
        # A StarFileBlock may have attributes that were read from the
        # starfile key=>value pairs.
        block0 = self.starfile["general"]
        # Note that no typecasting is performed
        self.assertEqual(block0._three, "3")

    def testLoop(self):
        # Indexing a block with an integer yields its loop as a DataFrame.
        loop = self.starfile[1][0]
        self.assertIsInstance(loop, DataFrame)

    def testData1(self):
        df = self.starfile["planetary"][0]
        self.assertEqual(8, len(df))
        self.assertEqual(4, len(df.columns))
        # Note that no typecasting of values is performed at io.StarFile level
        self.assertEqual("1", df[df["_name"] == "Earth"].iloc[0]["_gravity"])

    def testData2(self):
        df = self.starfile["planetary"][1]
        self.assertEqual(3, len(df))
        self.assertEqual(2, len(df.columns))
        # Missing values in a loop default to ''
        self.assertEqual("", df[df["_name"] == "Earth"].iloc[0]["_discovered_year"])

    def testSave(self):
        # Save the StarFile object to disk,
        # read it back, and check for equality.
        # Note that __eq__ is supported for StarFile/StarFileBlock classes
        with open("sample_saved.star", "w") as f:
            self.starfile.save(f)
        self.starfile2 = StarFile("sample_saved.star")
        self.assertEqual(self.starfile, self.starfile2)
        os.remove("sample_saved.star")
def save_metadata(self, starfile_filepath, new_mrcs=True, batch_size=512, save_mode=None):
    """
    Save updated metadata to a STAR file

    :param starfile_filepath: Path to STAR file where we want to save image_source
    :param new_mrcs: Whether to save all images to new MRCS files or not.
        If True, new file names and paths need to be created.
    :param batch_size: Batch size of images to query from the `ImageSource`
        object. Every `batch_size` rows, entries are written to STAR file.
    :param save_mode: Whether to save all images in a single or multiple
        files in batch size.
    :return: List of MRCS filenames images were assigned to when `new_mrcs`
        is True, otherwise None.
    """
    df = self._metadata.copy()
    # Drop any column that doesn't start with a *single* underscore
    df = df.drop(
        [
            str(col)
            for col in df.columns
            if not col.startswith("_") or col.startswith("__")
        ],
        axis=1,
    )

    # Bug fix: default the return value so that the `new_mrcs=False` path
    # does not raise UnboundLocalError at the final return.
    filename_indices = None

    with open(starfile_filepath, "w") as f:
        if new_mrcs:
            # Create a new column that we will be populating below.
            df["_rlnImageName"] = ""

            if save_mode == "single":
                # Save all images into one single mrc file
                fname = os.path.basename(starfile_filepath)
                fstem = os.path.splitext(fname)[0]
                mrcs_filename = f"{fstem}_{0}_{self.n-1}.mrcs"

                # Note, here the row_indexer is :, representing all rows in
                # this data frame. df.loc will be responsible for
                # dereferencing and assigning values to df.
                # Pandas will assert df.shape[0] == self.n
                df.loc[:, "_rlnImageName"] = [
                    f"{j + 1:06}@{mrcs_filename}" for j in range(self.n)
                ]
            else:
                # save all images into multiple mrc files in batch size
                for i_start in np.arange(0, self.n, batch_size):
                    i_end = min(self.n, i_start + batch_size)
                    num = i_end - i_start
                    mrcs_filename = (
                        os.path.splitext(os.path.basename(starfile_filepath))[0]
                        + f"_{i_start}_{i_end-1}.mrcs"
                    )

                    # Note, here the row_indexer is a slice.
                    # df.loc will be responsible for dereferencing and
                    # assigning values to df.
                    # Pandas will assert the length of row_indexer equals num.
                    row_indexer = df[i_start:i_end].index
                    df.loc[row_indexer, "_rlnImageName"] = [
                        "{0:06}@{1}".format(j + 1, mrcs_filename)
                        for j in range(num)
                    ]

            # Filenames are the part after the "@" in each image name.
            filename_indices = df._rlnImageName.str.split(pat="@", expand=True)[
                1
            ].tolist()

        # initialize the star file object and save it
        starfile = StarFile(blocks=[StarFileBlock(loops=[df])])
        starfile.save(f)

    return filename_indices
def save_metadata(
    self, starfile_filepath, new_mrcs=True, batch_size=512, save_mode=None
):
    """
    Save updated metadata to a STAR file

    :param starfile_filepath: Path to STAR file where we want to save image_source
    :param new_mrcs: Whether to save all images to new MRCS files or not.
        If True, new file names and paths need to be created.
    :param batch_size: Batch size of images to query from the `ImageSource`
        object. Every `batch_size` rows, entries are written to STAR file.
    :param save_mode: Whether to save all images in a single or multiple
        files in batch size.
    :return: List of MRCS filenames images were assigned to when `new_mrcs`
        is True, otherwise None.
    """
    df = self._metadata.copy()
    # Drop any column that doesn't start with a *single* underscore
    df = df.drop(
        [
            str(col)
            for col in df.columns
            if not col.startswith("_") or col.startswith("__")
        ],
        axis=1,
    )

    filename_indices = None
    with open(starfile_filepath, "w") as f:
        if new_mrcs:
            # Create a new column that we will be populating below.
            df["_rlnImageName"] = ""

            if save_mode == "single":
                # Save all images into one single mrc file
                fname = os.path.basename(starfile_filepath)
                fstem = os.path.splitext(fname)[0]
                mrcs_filename = f"{fstem}_{0}_{self.n-1}.mrcs"

                # Bug fix: assign through df.loc with a plain list instead of
                # chained indexing with a pd.Series. Chained assignment
                # (df["col"][slice] = ...) may silently write to a copy, and
                # assigning a 0-based Series aligns on index labels rather
                # than position, leaving NaNs for non-zero offsets.
                df.loc[df.index[: self.n], "_rlnImageName"] = [
                    f"{j + 1:06}@{mrcs_filename}" for j in range(self.n)
                ]
            else:
                # save all images into multiple mrc files in batch size
                for i_start in np.arange(0, self.n, batch_size):
                    i_end = min(self.n, i_start + batch_size)
                    num = i_end - i_start
                    mrcs_filename = (
                        os.path.splitext(os.path.basename(starfile_filepath))[0]
                        + f"_{i_start}_{i_end-1}.mrcs"
                    )
                    # Same fix as above: label-safe .loc assignment of a list.
                    df.loc[df.index[i_start:i_end], "_rlnImageName"] = [
                        "{0:06}@{1}".format(j + 1, mrcs_filename)
                        for j in range(num)
                    ]

            # Filenames are the part after the "@" in each image name.
            filename_indices = [
                name.split("@")[1] for name in df["_rlnImageName"][: self.n]
            ]

        # initialize the star file object and save it
        starfile = StarFile(blocks=[StarFileBlock(loops=[df])])
        starfile.save(f)

    return filename_indices