def text_write(spec: Spectrum, path: str, filename: str) -> None:
    """
    Writes an ASCII formatted spec file with appropriate header information.  The format can be read back in
    by the spec_load_write.text_load() method.

    :param spec: spectrum to be written
    :param path: /path/to/write/
    :param filename: filename to write in path
    :type spec: Spectrum
    :type path: str
    :type filename: str
    :return: None
    :rtype: None
    """
    dirCheck(path)
    with open(join(path, filename), 'w') as outfile:
        # One-line header, then a standard CSV table of wavelength / flux density / error
        header = "namestring=%s,z=%f,gmag=%f%s" % (spec.getNS(), spec.getRS(), spec.getGmag(), os.linesep)
        outfile.write(header)

        fieldnames = ["wavelength", "flux density", "error"]
        writer = DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(spec.lineDictList())

def namestring_dict_writer(output_dict: dict, path: str, filename: str, top_key: str = "namestring",
                           sub_keys: List[str] = None) -> None:
    """
    Writes a dictionary of [ namestring : sub_dictionary ], [ namestring : list ] or [ namestring : value ]
    to the disk as a CSV.  top_key will be written as the first column.

    If the values of output_dict are dicts themselves, the column names will be generated from the sub_dict's
    keys.  If sub_keys is passed, this is overridden; it can also be used to select values from each sub_dict,
    if desired.

    If the values are given as a list or a single value and a sub_keys list is not passed, no column header
    will be written.  If the values of output_dict are singular, they will be written in their str( value )
    form; similarly, if sub_keys is not passed, no header will be written.

    Note: Only one entry of output_dict is checked for the value format, so all values of output_dict are
    expected to share the same format.

    :param output_dict: namestring dictionary to write
    :param path: /path/to/write/to
    :param filename: filename.csv
    :param top_key: The title to give the top_key.  Defaults to 'namestring'.  Will be the first column written.
    :param sub_keys: A list of sub_keys to title the columns if the values of output_dict are passed as lists
    :return: None
    """
    from fileio.utils import dirCheck, join
    from common.constants import os

    dirCheck(path)
    namekeys = list(output_dict.keys())

    if type(output_dict[namekeys[0]]) == dict:
        sub_keys = sorted([f"{sub_key}" for sub_key in output_dict[namekeys[0]].keys()]) if sub_keys is None \
            else sub_keys
        sub_string = lambda x: ''.join([f"{x[k]}," for k in sub_keys])[:-1] + os.linesep
    elif type(output_dict[namekeys[0]]) == list:
        sub_string = lambda x: ''.join(f'{item},' for item in x)[:-1] + os.linesep
    else:
        sub_string = lambda x: f"{x}{os.linesep}"

    with open(join(path, filename), 'w') as outfile:
        if sub_keys is not None:
            outfile.write(f'{top_key},' + ''.join([f"{sk}," for sk in sub_keys])[:-1] + os.linesep)
        outfile.writelines(f"{key},{sub_string(output_dict[key])}" for key in namekeys)

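# Hypothetical usage sketch, added for illustration (not part of the original module).  It shows the CSV
# layout namestring_dict_writer() produces for a dict-of-dicts input; the namestrings, values and path
# are invented.
def _example_namestring_dict_writer() -> None:
    example = {"55555-4444-333": {"z": 1.20, "ab": 18.40},
               "55555-4444-334": {"z": 1.31, "ab": 18.72}}
    # Column titles come from the sorted sub_dict keys, so /tmp/example.csv would read:
    #   namestring,ab,z
    #   55555-4444-333,18.4,1.2
    #   55555-4444-334,18.72,1.31
    namestring_dict_writer(example, "/tmp", "example.csv")
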
def text_load(path: str, filename: str) -> Spectrum:
    """
    Loads the standardized ASCII format Spectrum file as written by text_write.

    Note: If for some reason the redshift and/or gmag values cannot be converted to a float, they will be
    assigned a value of -1.

    :param path: /path/to/input file
    :param filename: input file name
    :type path: str
    :type filename: str
    :return: Loaded Spectrum
    :rtype: Spectrum
    :raises: FileNotFoundError
    """
    fileCheck(path, filename)
    with open(join(path, filename), 'r') as infile:
        # Read header.  File format:
        #
        #   namestring=55555-4444-333,z=float(),gmag=float()
        #   wavelength,flux density,error
        #
        # Parse the first line, use the second as CSV reader input
        header = infile.readline().strip().split(',')
        namestring = fns(header[0])
        try:
            z = float(header[1].strip("z="))
        except ValueError:
            z = -1
        try:
            gmag = float(header[2].strip("gmag="))
        except ValueError:
            gmag = -1

        reader = DictReader(infile, fieldnames=infile.readline().strip().split(','))
        wls = []
        flux = []
        err = []
        for row in reader:
            try:
                wls.append(int(row['wavelength']))
            except ValueError:
                wls.append(float(row['wavelength']))
            flux.append(float(row['flux density']))
            err.append(float(row['error']))

    spec = Spectrum(namestring=namestring, z=z, gmag=gmag)
    spec.setDict(wls, flux, err)
    return spec

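# Illustrative round-trip sketch, added here rather than taken from the original module: writes a minimal
# Spectrum with text_write() and reads it back with text_load().  The namestring, values and path are
# invented, and the exact CSV row formatting depends on Spectrum.lineDictList().
def _example_text_round_trip() -> Spectrum:
    spec = Spectrum(namestring="55555-4444-333", z=1.2, gmag=18.4)
    spec.setDict([4000, 4001, 4002], [1.0, 1.1, 1.2], [0.1, 0.1, 0.1])
    text_write(spec, "/tmp", "example.text_spec")
    # The file now holds a one-line header followed by a CSV table, roughly:
    #   namestring=55555-4444-333,z=1.200000,gmag=18.400000
    #   wavelength,flux density,error
    #   4000,1.0,0.1
    #   ...
    return text_load("/tmp", "example.text_spec")
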
def load(path: str, filename: str) -> Spectrum:
    """
    Loads the serialized spectrum file at /path/filename

    :param path: /path/to/filename
    :param filename: file name of spectrum to load
    :type path: str
    :type filename: str
    :rtype: Spectrum
    :raises: FileNotFoundError
    """
    fileCheck(path, filename)
    with open(join(path, filename), 'rb') as infile:
        return pickle.load(infile)

def namestring_dict_reader(path: str, filename: str, top_key: str = "namestring", has_header: bool = True) -> dict:
    """
    Reads a namestring-keyed CSV (such as one written by namestring_dict_writer) back into a dictionary.

    If has_header is True, the first line supplies the column names and each remaining line becomes
    { namestring : { column : value } }.  Otherwise each line becomes { namestring : value } for two-column
    lines or { namestring : [ values... ] } for longer ones.  Numeric strings are converted to int or float
    where possible.

    :param path: /path/to/filename
    :param filename: filename.csv
    :param top_key: column title of the namestring column.  Defaults to 'namestring'
    :param has_header: whether the first line of the file is a column header.  Defaults to True
    :return: namestring dictionary
    :rtype: dict
    """
    from fileio.utils import fileCheck, join

    fileCheck(path, filename)

    def num_conv(num: str) -> Union[float, int]:
        # Try int first, then float; leave the string untouched if neither conversion works
        try:
            num = int(num)
        except ValueError:
            try:
                num = float(num)
            except ValueError:
                pass
        return num

    def form_dict(line: dict) -> dict:
        namestring = line.pop(top_key)
        for k, v in line.items():
            line[k] = num_conv(v)
        return {namestring: dict(line)}

    def form_list_value(line: str) -> dict:
        line = line.strip().split(',')
        if len(line) > 2:
            for i in range(1, len(line)):
                line[i] = num_conv(line[i])
            return {line[0]: line[1:]}
        line[1] = num_conv(line[1])
        return {line[0]: line[1]}

    with open(join(path, filename), 'r') as infile:
        outdict = {}
        reader = infile
        if has_header:
            from csv import DictReader
            reader = DictReader(infile, fieldnames=infile.readline().strip().split(','))
            reader_func = form_dict
        else:
            reader_func = form_list_value
        for line in reader:
            outdict.update(reader_func(line))
        return outdict

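# Hypothetical usage sketch (an addition for illustration, not original code): reading back a CSV of the kind
# produced by namestring_dict_writer().  The path and file name are placeholders.
def _example_namestring_dict_reader() -> dict:
    # For a file whose contents are:
    #   namestring,ab,z
    #   55555-4444-333,18.4,1.2
    # this returns { '55555-4444-333': { 'ab': 18.4, 'z': 1.2 } }, with numeric strings converted by num_conv()
    return namestring_dict_reader("/tmp", "example.csv")
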
def write(spec: Spectrum, path: str, filename: str) -> None:
    """
    Writes a serialized spectrum file at /path/filename

    :param spec: spectrum to be written
    :param path: /path/to/filename
    :param filename: file name to be written to
    :type spec: Spectrum
    :type path: str
    :type filename: str
    :return: None
    """
    dirCheck(path)
    with open(join(path, filename), 'wb') as outfile:
        pickle.dump(spec, outfile, protocol=pickle.HIGHEST_PROTOCOL)

def simple_list_reader(path: str, filename: str, valuespliter: Union[str, None] = ",") -> list:
    """
    Reads in a simple list from a file.  Will attempt to split each line by the valuespliter variable.

    Is capable of discerning between input types of int, float and str, and will evaluate to these types
    accordingly.

    If the length of line.split( valuespliter ) == 1, returns a simple list of values.  If that length is
    greater than one, the entry will be a tuple of all the individual values.

    :param path: /path/to/filename
    :param filename: name of the file
    :param valuespliter: value to split the line by.  Defaults to a comma ","
                If you need to ensure the line is NOT split, pass valuespliter = None
    :type path: str
    :type filename: str
    :type valuespliter: str or NoneType
    :return: List of file lines
    :rtype: list
    """
    from fileio.utils import fileCheck, join

    # Use this method to determine subtypes and assign accordingly,
    # i.e. figure out whether a value is an int, float, or string.
    # If it can't make the value one of those three types, it throws an error
    def __get_type(value) -> type:
        types = [int, float, str]
        for t in types:
            try:
                t(value)
                return t
            except ValueError:
                continue
        raise ValueError(f"Unable to determine type of input value: {value}")

    fileCheck(path, filename)
    outlist = []
    with open(join(path, filename), 'r') as infile:
        for line in infile:
            line = line.strip().split(valuespliter)
            for i in range(len(line)):
                line[i] = __get_type(line[i].strip())(line[i])
            if len(line) == 1:
                line = line[0]
            else:
                line = tuple(line)
            outlist.append(line)
    return outlist

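# Illustrative sketch (an addition, not original code) of what simple_list_reader() returns for a small
# comma-separated file.  The path and file contents are made up.
def _example_simple_list_reader() -> list:
    # For a file containing the two lines:
    #   53521-1714-054
    #   4000, 1.5, 0.1
    # the first line has no comma, so it stays a single str entry, while the second is split and typed,
    # giving:  [ '53521-1714-054', (4000, 1.5, 0.1) ]
    return simple_list_reader("/tmp", "example.list")
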
def __fix_outpath(path: str, filename: str = None) -> str:
    """
    Simple wrapper to ensure that the output path sent to Gnuplot is properly formatted.  The path returned
    works in both Linux and Windows systems.  It's the result of not just passing a path to an open/close
    operation, but passing a string to a method to a wrapper to a pipe to Gnuplot.

    Automatically inserts the quotations around the "filename" in the string, i.e. the gnuplot command is
    sent properly as

        set output "/path/to/file name"

    which takes a few extra backslashes so that the string doesn't just see escape characters the whole way.

    :param path: /path/to/filename
    :type path: str
    :param filename: output file name
    :type filename: str
    :return: File path formatted in a way that Gnuplot.py will properly find the correct file
    :rtype: str
    """
    from fileio.utils import join
    path = join(path, filename) if filename is not None else path
    return '"' + ''.join([r"\\" if char == "\\" else char for char in path]) + '"'

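# Minimal sketch (added for illustration): the transformation __fix_outpath() performs, using an invented
# Windows-style path.  In memory the argument below is  C:\plots\file name.eps ; the return value has every
# backslash doubled and double quotes wrapped around the whole thing, so printing it shows:
#   "C:\\plots\\file name.eps"
# which is the form the Gnuplot pipe needs to receive after  set output .
def _example_fix_outpath() -> str:
    return __fix_outpath("C:\\plots\\file name.eps")
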
def fit_spec_loader(path: str, filename: str, mask_dict: dict = DEF_MASK_DICT) -> Spectrum:
    """
    Loads a FIT spectrum file from SDSS DR7 or lower.  Converts it into Spectrum type.

    Note: mask_dict has the actual mask values as keys.  Loader will iterate through these keys and delete
    any points where these keys are found.  The dict format is an artifact where the values attached to each
    key are the SDSS error names in text.

    :param path: /path/to/file
    :param filename: filename.fits
    :param mask_dict: Defaults to DEF_MASK_DICT defined in this file if not passed
    :type path: str
    :type filename: str
    :type mask_dict: dict
    :rtype: Spectrum
    """
    from astropy.io.fits import getheader, getdata
    from fileio.utils import fileCheck, join
    from catalog import shenCat

    fileCheck(path, filename)
    shenCat.load()
    infile = join(path, filename)

    # Assemble basic info from the header
    # Check if the HW redshift is included in the shenCat.  If so, assign it, otherwise use the one in the file
    header = getheader(infile, 0)
    namestring = "%05i-%04i-%03i" % (header['MJD'], header['PLATEID'], header['FIBERID'])
    z = shenCat.subkey(namestring, 'z') if namestring in shenCat else float(header['z'])
    gmag = float(header['MAG'].split()[1])  # Stored as UGRIZ

    data = getdata(infile, 0)
    flux_data = data[0].tolist()  # first aperture is the calibrated spectrum flux density
    # data[ 1 ] is the continuum-subtracted spectrum.  Not of interest
    err_data = data[2].tolist()  # third is the +/- of flux density
    mask_data = data[3].tolist()  # error mask

    # Wavelength values are not stored in FIT files.  Only three values are available, and these are used to
    # generate the wavelengths which correspond to the pixels
    #   i.e. wl[ pixel 0 ] -> flux density[ 0 ], error[ 0 ], mask[ 0 ], etc
    #
    # Those 3 values are:
    #   naxis1 : number of pixels stored
    #   coeff0 : Log10 of the first wavelength
    #   coeff1 : Log10 of the dispersion coefficient
    #
    # Log10( wavelengths ) are generated by the function:  log_wl_n( n ) = c0 + c1 * n
    # where n is the nth pixel
    # Then the wavelength, in angstroms, is given by 10^( log_wl_n )
    c0 = header['coeff0']
    c1 = header['coeff1']
    num_pixels = header['naxis1']

    # The actual wavelength generation happens here
    wavelengths = [pow(10, c0 + c1 * n) for n in range(num_pixels)]

    out_spec = Spectrum(namestring=namestring, z=z, gmag=gmag)
    out_spec.setDict(wavelengths, flux_data, err_data)

    # Mask out the errors
    for i in range(len(err_data)):
        if __bit_mask(mask_data[i], mask_dict):
            del out_spec[wavelengths[i]]
    return out_spec

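# Worked example of the wavelength reconstruction above, using illustrative (made-up) header values:
#   c0 = 3.5800, c1 = 0.0001
#   pixel 0 -> 10 ** (3.5800 + 0.0001 * 0) ≈ 3801.9 Angstroms
#   pixel 1 -> 10 ** (3.5800 + 0.0001 * 1) ≈ 3802.8 Angstroms
# i.e. the pixel grid is logarithmically spaced in wavelength rather than linear.
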
def main():
    """
    These were some spectra with interesting results from the full catalog search.  Their namestring dicts are
    kept in CSV files in the folder stored at interestpath below.  For each file in that folder this process will:

    1. Load the CSV into a namestring dict
    2. Load both the primary spectrum from the matches (given by the file name) and the matching spectra from
       the namestring dict keys - scaled to the primary
    3. Form a composite spectrum out of all the Spectrum objects
    4. Fit a log function to the initial namestring dict results
    5. Create an output folder given by the primary namestring
    6. Plot the composite spectrum generated
    7. Plot an AB v Z graph of the initial namestring dict results with the fit line
    8. Plot a 4-panel chart of the primary spectrum and its initial matches
    9. Generate a chi^2 pipeline for the composite, run it against the entire catalog, reduce the results
    10. Join the results with data in shenCat to a namestring dict, write them to a CSV with the plots
    11. Fit the results to a log function
    12. Plot a 4-panel chart and an AB v Z chart for the composite and matching results

    This should provide a pretty good step-by-step example of a multitude of what the library can do, and how
    its moving parts can all come together.  It's a good idea to step through this by debugging it.
    """
    interestpath = join(BASE_ANALYSIS_PATH, "Matches of Interest")
    outpath = join(interestpath, "Analysis")
    max_chi = 0.5

    # Define the function we'll be fitting to.  Note: a and b are read from main()'s scope at call time;
    # they're assigned by the generic_log10_fit calls inside the loop below.
    def fit_func(x):
        from numpy import log10
        return a * log10(x) + b

    # Get the files of interest
    filelist = getFiles(interestpath)

    # Start looping through them
    for f in filelist:
        # Print a line to the console saying which file we're working on.  This method doesn't put a carriage
        # return at the end, so when this process completes, we can print "Done." on the same line at the end.
        unfinished_print(f"Analyzing {fns(f)} matches...")

        # Read in the namestring dict of the initial matches
        indict = namestring_dict_reader(interestpath, f)

        # Load the primary spectrum and the matches from the namestring dict.  Scale them.  Add the primary to
        # the group of spectra being used to form a composite
        prime = rspecLoader(fns(f))
        speclist = async_rspec_scaled(indict, prime)
        speclist.insert(0, prime)

        # Define the output path for this set of data by just using the primary spectrum's namestring.
        # If the folder doesn't yet exist (it doesn't), create it.  Otherwise things will try to write to
        # folders that don't exist.  Clearly, that'd be a problem.
        specoutpath = join(outpath, prime.getNS())
        dirCheck(specoutpath)

        # Form the composite and write both its binary and textual format to the output folder
        composite = compose_speclist(speclist, f"Composite {prime.getNS()}")
        write(composite, specoutpath, "composite.cspec")
        text_write(composite, specoutpath, "composite.text_cspec")

        # Fit the log function, get constants and their uncertainties.  Just unpack the tuple directly and form
        # a string using those values.  This will go onto the AB v Z plots so we can see what they actually are.
        (a, b), (a_uncert, b_uncert) = generic_log10_fit(
            [shenCat[ns]['z'] for ns in indict], [shenCat[ns]['ab'] for ns in indict], True)
        fit_str = f"%0.2f {PM} %0.2f log10( x ) {'+' if b > 0 else '-'} %0.2f {PM} %0.2f" % (
            a, a_uncert, abs(b), b_uncert)

        # Plot the composite
        composite.plot(specoutpath)

        # Plot the 4-panel chart of scaled matches to the primary (the ones that formed the composite), along
        # with their AB v Z, its fit, and since a primary was specified, ab_z_plot will generate a modeled
        # evolution as well using tools.cosmo
        four_panel_multiplot(specoutpath, f"{prime.getNS()} Multiplot.pdf", prime, speclist,
                             f"Matches to {prime.getNS()}")
        ab_z_plot(specoutpath, f"{prime.getNS()} AB v Z.pdf", speclist, prime,
                  plotTitle=f"Matches to {prime.getNS()} {PM} 2 Sigma", rs_fit_func=fit_func,
                  rs_fit_title=fit_str)

        # Since we aren't exiting some method, python's garbage collector won't be triggered all that often and
        # large lists of large data are left to float about for a while.  This often results in the kernel
        # dumping a lot of data that won't be used again into the swap space on the hard drive - which is slow,
        # but the kernel doesn't know any better, since this is just looping.  Nothing to say, "we're done with
        # this data."  It's easier to just delete large arrays that won't be used again and eliminate that lost
        # time, plus save up some swap.  (Deleted here, after the plots above are done with speclist.)
        del speclist

        # Get a chi^2 copy of an analysis_pipeline.  This method prebuilds a pipe specifically for chi^2
        # matching.  It's easier to just let the wrapper do it.  A raw pipe can also be built by using the
        # class directly, but the idea of loading and scaling, etc, can all be handled here.  This method
        # returns a pipeline ready to do so.  You're welcome.
        pipeline = get_chi_analysis_pipeline(composite, shenCat, CONT_RANGE, max_chi)

        # So make it go.  This method - especially in this case where it's matching against the entire catalog -
        # will hit your processor as hard as it reasonably can (it's limited to a maximum number of processes
        # equal to the number of logical CPU cores you have).  The system won't lock or anything, but a lot of
        # RAM will be used and a lot of heat will be generated.  This will take ESPECIALLY long if you're
        # running in DEBUG mode.  Go get a cup of coffee or tell Dr. Monier some jokes.  Better yet - read that
        # long write up I wrote for you.  Decode my cryptic/manic ravings.  If you find typos, keep them to
        # yourself.
        pipeline.do_analysis()

        # Now that the process is complete, cut down the number of results.  We've initialized the pipe to dump
        # everything that returns a chi^2 value > 0.5 (max_chi variable at the top).  If you don't reduce the
        # results and instead just call get_results() you'll get a namestring dict of ALL the results of ALL
        # values.  That's not a bad thing if you want to see what all the data looks like, but here it's not
        # needed.
        results = pipeline.reduce_results()

        # These results are in the form { namestring : float }.  Give them their own key and position in a
        # subdict containing all the useful info in shenCat.  This way, when looking at the CSV later on, all
        # the data is there.  This little method is pretty useful for pipeline results in that way.
        results = join_with_shen_cat(results, 'chi')

        # Write the namestring dict to the disk in a CSV format
        namestring_dict_writer(results, specoutpath, "composite matches.csv")

        # Fit the resulting data, make the fit string for the plot, and plot it all
        (a, b), (a_uncert, b_uncert) = generic_log10_fit(
            [shenCat[ns]['z'] for ns in results], [shenCat[ns]['ab'] for ns in results], True)
        fit_str = f"%0.2f {PM} %0.2f log10( x ) {'+' if b > 0 else '-'} %0.2f {PM} %0.2f" % (
            a, a_uncert, abs(b), b_uncert)
        ab_z_plot(specoutpath, "Composite AB Z.pdf", results,
                  plotTitle=f"Composite Matches: Maximum {CHI} = {max_chi}", rs_fit_func=fit_func,
                  rs_fit_title=fit_str)

        # Will need to load up the spectra that matched for the 4-panel plot.  This method both loads them up
        # and scales them to the spectrum they're given.  The sort_iterable... one returns a list of
        # namestrings sorted by redshift (by default - other shenCat keys can be used if desired).  The
        # scale_to factor in async_rspec_scaled can also be a flux density value if desired.
        results = async_rspec_scaled(sort_iterable_by_shen_key(results), scale_to=composite)
        four_panel_multiplot(specoutpath, "Composite Matches Multiplot.pdf", composite, results,
                             f"Matches to Composite Spectrum: Maximum {CHI} = {max_chi}")

        done()  # print a pretty "Done." to the console

        # Be proactive with your memory.  Real Python programmers would probably hate that I do this.  They'd
        # tell me to wrap all this into a submethod and call that and let the garbage collector handle it all.
        # They're probably right, but oh well.  I've already gone this far.
        del results, pipeline