def from_gwu_chem_UVVIS(filelist, sortnames=False, shortname=True, cut_extension=False, name=''):
    ''' Build a TimeSpectra from comma delimited, two column files written by
    GWU chemistry's UVVis.

    These files have no useful metadata or dark data, so column order is the
    only ordering information available.  Either pass in a correctly ordered
    filelist, or reorder afterwards with df=df.reindex(columns=[correct order]).

    Each file is read with read_csv() into its own one-column frame; the
    frames are then merged column-wise with concat().

    Parameters
    ----------
    filelist: Iterable of file paths.  It is copied into a list once, so
        one-shot iterators (e.g. generators) are accepted.

    sortnames: If True, columns are reindexed to sorted(column names).
        Otherwise, order of files passed in is directly used as columns.

    shortname: If False, the path exactly as passed is used as the column
        name.  If True, the name is produced by get_shortname() (per the
        original notes: only the filename).

    cut_extension: Forwarded to get_shortname(); only consulted when
        shortname is True (per the original notes: whether the file
        extension is cut from the column name).

    name: If non-empty, assigned to the name attribute of the result.

    Returns
    -------
    TimeSpectra whose metadata, filedict and baseline are None and whose
    specunit is 'nm'.
    '''
    # The paths are needed twice (names, then reading); copy so that a
    # generator argument is not silently exhausted after the first pass.
    filelist = list(filelist)

    ### Either full names or short names of filelist
    if shortname:
        working_names = [get_shortname(afile, cut_extension=cut_extension)
                         for afile in filelist]
    else:
        working_names = list(filelist)

    # One single-column frame per file; column label is the working name.
    dflist = [read_csv(afile,
                       sep=',',
                       header=None,
                       index_col=0,
                       skiprows=2,
                       na_values=' ',  #Used to be ' \r', or is this from IR?
                       names=[colname])
              for afile, colname in zip(filelist, working_names)]

    ### THIS IS BUSTED, PUTTING NANS EVERYWHERE EXCEPT ONE FILE, but dflist itself ws nice.
    # NOTE(review): the remark above is from the original author; presumably
    # the per-file indexes do not align exactly -- confirm before relying on
    # the merged values.
    dataframe = concat(dflist, axis=1)

    ### concat tries to sort these, so this will preserve the sort order
    if sortnames:
        dataframe = dataframe.reindex(columns=sorted(working_names))

    dataframe = TimeSpectra(dataframe)  #this is fine

    # No metadata, file mapping or dark spectrum exists for this file type.
    dataframe.metadata = None
    dataframe.filedict = None
    dataframe.baseline = None
    dataframe.specunit = 'nm'  #This autodetected in plots
    if name:
        dataframe.name = name

    return dataframe
def from_gwu_chem_UVVIS(filelist, sortnames=False, shortname=True, cut_extension=False, name=''):
    ''' Build a TimeSpectra from comma delimited, two column files written by
    GWU chemistry's UVVis.

    These files have no useful metadata or dark data, so column order is the
    only ordering information available.  Either pass in a correctly ordered
    filelist, or reorder afterwards with df=df.reindex(columns=[correct order]).

    Each file is read with read_csv() into its own one-column frame; the
    frames are then merged column-wise with concat().

    Parameters
    ----------
    filelist: Iterable of file paths.  It is copied into a list once, so
        one-shot iterators (e.g. generators) are accepted.

    sortnames: If True, columns are reindexed to sorted(column names).
        Otherwise, order of files passed in is directly used as columns.

    shortname: If False, the path exactly as passed is used as the column
        name.  If True, the name is produced by get_shortname() (per the
        original notes: only the filename).

    cut_extension: Forwarded to get_shortname(); only consulted when
        shortname is True (per the original notes: whether the file
        extension is cut from the column name).

    name: If non-empty, assigned to the name attribute of the result.

    Returns
    -------
    TimeSpectra whose metadata, filedict and baseline are None and whose
    specunit is 'nm'.
    '''
    # The paths are needed twice (names, then reading); copy so that a
    # generator argument is not silently exhausted after the first pass.
    filelist = list(filelist)

    ### Either full names or short names of filelist
    if shortname:
        working_names = [get_shortname(afile, cut_extension=cut_extension)
                         for afile in filelist]
    else:
        working_names = list(filelist)

    # One single-column frame per file; column label is the working name.
    dflist = [read_csv(afile,
                       sep=',',
                       header=None,
                       index_col=0,
                       skiprows=2,
                       na_values=' ',  #Used to be ' \r', or is this from IR?
                       names=[colname])
              for afile, colname in zip(filelist, working_names)]

    ### THIS IS BUSTED, PUTTING NANS EVERYWHERE EXCEPT ONE FILE, but dflist itself ws nice.
    # NOTE(review): the remark above is from the original author; presumably
    # the per-file indexes do not align exactly -- confirm before relying on
    # the merged values.
    dataframe = concat(dflist, axis=1)

    ### concat tries to sort these, so this will preserve the sort order
    if sortnames:
        dataframe = dataframe.reindex(columns=sorted(working_names))

    dataframe = TimeSpectra(dataframe)  #this is fine

    # No metadata, file mapping or dark spectrum exists for this file type.
    dataframe.metadata = None
    dataframe.filedict = None
    dataframe.baseline = None
    dataframe.specunit = 'nm'  #This autodetected in plots
    if name:
        dataframe.name = name

    return dataframe
def from_spec_files(file_list, name='', skiphead=17, skipfoot=1, check_for_overlapping_time=True, extract_dark=True):
    ''' Takes in raw files directly from Ocean optics USB2000 and USB650
    spectrometers and returns a skspec TimeSpectra.  If spectral data is
    stored without a header, can be called with skiphead=0.

    Parameters
    ----------
    file_list: Iterable of file paths.  It is copied internally; the
        caller's list is not modified.  Entries whose basename is
        '.gitignore' are skipped.

    name: Set name of returned TimeSpectra.

    skiphead/skipfoot: Number of header/footer lines to skip in each file.
        Defaults reflect that this filetype has a 17 line header and a
        1 line footer.

    check_for_overlapping_time: If True, raise IOError as soon as two files
        resolve to the same timestamp.  If False, the later file overwrites
        the earlier one and a warning with the duplicate count is logged.
        Really only useful for testing or otherwise cornercase instances.

    extract_dark: If True, extract_darkfile() is asked for a dark file in
        file_list (per the original notes: case-insensitive filename match
        on "dark"; warns if none is found, raises if several are found).
        A dark file that is found becomes the baseline and is excluded from
        the spectra.

    Returns
    -------
    TimeSpectra with specunit 'nm', filedict (timestamp -> filename, dark
    file excluded), baseline (dark Series or None) and metadata set.

    Raises
    ------
    IOError: duplicate timestamp while check_for_overlapping_time is True.

    Notes
    -----
    Built to work with 2-column data only!!!

    The frame is constructed from a dict of Series keyed by timestamp, so
    files with non-identical wavelength grids can be combined (missing
    points become nans).
    '''

    def _read_header(path):
        ''' Return the first skiphead lines of path, stripped. '''
        with open(path) as f:
            return [next(f).strip() for _ in range(skiphead)]

    dict_of_series = {}   #Dict of series eventually merged to dataframe
    time_file_dict = {}   #Dict of time:filename (darkfile intentionally excluded)
    _overlap_count = 0    # Tracks if overlapping occurs

    # Bound up front so the code below never hits an unbound name when
    # extract_dark is False or when no dark file is found.
    darkfile = None
    baseline = None

    # Private copy: the dark file is removed from it below and the caller's
    # list must not change as a side effect.
    file_list = list(file_list)

    ### If looking for a darkfile, this will find it.  Bit redundant but I'm lazy..###
    if extract_dark:
        darkfile = extract_darkfile(file_list, return_null=True)

        if darkfile:
            header = _read_header(darkfile)
            wavedata = np.genfromtxt(darkfile, dtype=spec_dtype,
                                     skip_header=skiphead, skip_footer=skipfoot)
            # Result is unused; the call is retained so the dark header is
            # still parsed as before.  NOTE(review): drop if not needed.
            _get_datetime_specsuite(header)
            baseline = Series(wavedata['intensity'],
                              index=wavedata['wavelength'], name=darkfile)
            file_list.remove(darkfile)

    file_list = [f for f in file_list if os.path.basename(f) != '.gitignore']

    for infile in file_list:

        ###Read in only the header lines, not all the lines of the file
        header = _read_header(infile)

        #Store wavelength, intensity data in a 2-column datatime for easy itemlookup
        #Eg wavedata['wavelength']
        wavedata = np.genfromtxt(infile, dtype=spec_dtype,
                                 skip_header=skiphead, skip_footer=skipfoot)

        # Extract time data from header
        timestamp = _get_datetime_specsuite(header)

        # Make sure timepoints aren't overlapping with any others
        if timestamp in time_file_dict:
            _overlap_count += 1
            if check_for_overlapping_time:
                raise IOError('Duplicate time %s found in between files %s, %s.'
                              ' To overwrite, set check_for_overlapping_time = False.'
                              % (timestamp, infile, time_file_dict[timestamp]))

        time_file_dict[timestamp] = infile
        dict_of_series[timestamp] = Series(wavedata['intensity'],
                                           index=wavedata['wavelength'])

    ### Make timespec, add filenames, baseline and metadata attributes (note, DateTimeIndex auto sorts!!)
    timespec = TimeSpectra(DataFrame(dict_of_series), name=name)  #Dataframe beacuse TS doesn't handle dict of series
    timespec.specunit = 'nm'
    timespec.filedict = time_file_dict
    timespec.baseline = baseline  #KEEP THIS AS DARK SERIES RECALL IT IS SEPARATE FROM reference OR REFERENCE..

    ### Take metadata from first file in filelist that isn't darkfile
    # Empty default keeps update() below valid when no file qualifies.
    meta_partial = {}
    for infile in file_list:
        if infile != darkfile:
            meta_partial = _get_metadata_fromheader(_read_header(infile))
            break

    meta_general = get_headermetadata_dataframe(timespec, time_file_dict)
    meta_general.update(meta_partial)
    timespec.metadata = meta_general

    if _overlap_count:
        logger.warn('Time duplication found in %s of %s files.  Duplicates were '
                    'removed!' % (_overlap_count, len(file_list)))

    return timespec
def from_timefile_datafile(datafile, timefile, extract_dark=True, name=''):
    ''' Converts old-style spectral data from GWU phys lab into a TimeSpectra
    with timestamp column index and wavelength row indicies.

    The timefile supplies one row per spectrum; its first row is discarded
    and the first field of every remaining row is used as the filename.  The
    datafile is read as a float matrix (first line skipped) whose first
    column holds the wavelengths and whose remaining columns hold spectra.

    Parameters
    ----------
    datafile: Path of the data matrix file.
    timefile: Path of the file listing filename/time information.
    extract_dark: If True, extract_darkfile() is asked for a dark file among
        the filenames; if one is found its column becomes the baseline and
        is removed from the data and from filedict.
    name: If non-empty, becomes the name of the returned object.

    Returns
    -------
    TimeSpectra with baseline, filedict, metadata and specunit ('nm') set.
    '''
    # Context manager so the handle is closed deterministically.
    with open(timefile, 'r') as tfile:
        tlines = [line.strip().split() for line in tfile.readlines()]
    tlines.pop(0)  # first row is not a data row (presumably a header -- confirm)

    time_file_dict = dict((_get_datetime_timefile(tline), tline[0]) for tline in tlines)

    ### Read in data matrix, separate first row (wavelengths) from the rest of the data
    wavedata = np.genfromtxt(datafile, dtype='float', skip_header=1)
    data, wavelengths = wavedata[:, 1:], wavedata[:, 0]  #Separate wavelength column

    ### Sort datetimes here before assigning/removing dark spec etc...
    # NOTE(review): the time-sorted labels are applied to the data columns
    # in their on-disk order; this assumes the datafile columns are already
    # chronological -- confirm against the instrument output.
    sorted_tfd = sorted(time_file_dict.items())
    sorted_times, sorted_files = zip(*sorted_tfd)

    # Bound up front so the code below never hits an unbound name when
    # extract_dark is False or when no dark file is found.
    darkfile = None
    baseline = None

    ### Seek darkfile.  If found, take it out of dataframe. ###
    if extract_dark:
        darkfile = extract_darkfile(sorted_files, return_null=True)

        if darkfile:
            ####Find baseline by reverse lookup (lookup by value) and get index position
            darkindex = sorted_files.index(darkfile)
            darktime = sorted_times[darkindex]
            baseline = Series(data[:, darkindex], index=wavelengths, name=darkfile)

            del time_file_dict[darktime]       #Intentionally remove
            sorted_times = list(sorted_times)  #zip() gave a tuple; need a list to remove from
            sorted_times.remove(darktime)
            data = np.delete(data, darkindex, 1)  #Delete dark column from numpy data

    dataframe = TimeSpectra(data, columns=sorted_times, index=wavelengths)

    ### Add field attributes to dataframe
    dataframe.baseline = baseline
    dataframe.filedict = time_file_dict
    if name:
        dataframe.name = name

    ### Get headermeta data from first line in timefile that isn't darkfile.  Only checks one line
    ### Does not check for consistency
    # Empty default keeps update() below valid when no line qualifies.
    meta_partial = {}
    for line in tlines:
        if line[0] != darkfile:
            meta_partial = _get_headermetadata_timefile(line[0])  #DOUBLE CHECK THIS WORKS
            break

    ### Extract remaining metadata (file/time info) and return ###
    meta_general = get_headermetadata_dataframe(dataframe, time_file_dict)
    meta_general.update(meta_partial)
    dataframe.metadata = meta_general
    dataframe.specunit = 'nm'  #This autodetected in plots

    ### Sort dataframe by ascending time (could also sort spectral data) ###
    # NOTE(review): if this forwards to pandas, DataFrame.sort no longer
    # exists in current releases (sort_index(axis=1) replaces it) -- confirm
    # how TimeSpectra implements sort before changing.
    dataframe.sort(axis=1, inplace=True)  #axis1=columns

    return dataframe
def from_timefile_datafile(datafile, timefile, extract_dark=True, name=''):
    ''' Converts old-style spectral data from GWU phys lab into a TimeSpectra
    with timestamp column index and wavelength row indicies.

    The timefile supplies one row per spectrum; its first row is discarded
    and the first field of every remaining row is used as the filename.  The
    datafile is read as a float matrix (first line skipped) whose first
    column holds the wavelengths and whose remaining columns hold spectra.

    Parameters
    ----------
    datafile: Path of the data matrix file.
    timefile: Path of the file listing filename/time information.
    extract_dark: If True, extract_darkfile() is asked for a dark file among
        the filenames; if one is found its column becomes the baseline and
        is removed from the data and from filedict.
    name: If non-empty, becomes the name of the returned object.

    Returns
    -------
    TimeSpectra with baseline, filedict, metadata and specunit ('nm') set.
    '''
    # Context manager so the handle is closed deterministically.
    with open(timefile, 'r') as tfile:
        tlines = [line.strip().split() for line in tfile.readlines()]
    tlines.pop(0)  # first row is not a data row (presumably a header -- confirm)

    time_file_dict = dict((_get_datetime_timefile(tline), tline[0]) for tline in tlines)

    ### Read in data matrix, separate first row (wavelengths) from the rest of the data
    wavedata = np.genfromtxt(datafile, dtype='float', skip_header=1)
    data, wavelengths = wavedata[:, 1:], wavedata[:, 0]  #Separate wavelength column

    ### Sort datetimes here before assigning/removing dark spec etc...
    # NOTE(review): the time-sorted labels are applied to the data columns
    # in their on-disk order; this assumes the datafile columns are already
    # chronological -- confirm against the instrument output.
    sorted_tfd = sorted(time_file_dict.items())
    sorted_times, sorted_files = zip(*sorted_tfd)

    # Bound up front so the code below never hits an unbound name when
    # extract_dark is False or when no dark file is found.
    darkfile = None
    baseline = None

    ### Seek darkfile.  If found, take it out of dataframe. ###
    if extract_dark:
        darkfile = extract_darkfile(sorted_files, return_null=True)

        if darkfile:
            ####Find baseline by reverse lookup (lookup by value) and get index position
            darkindex = sorted_files.index(darkfile)
            darktime = sorted_times[darkindex]
            baseline = Series(data[:, darkindex], index=wavelengths, name=darkfile)

            del time_file_dict[darktime]       #Intentionally remove
            sorted_times = list(sorted_times)  #zip() gave a tuple; need a list to remove from
            sorted_times.remove(darktime)
            data = np.delete(data, darkindex, 1)  #Delete dark column from numpy data

    dataframe = TimeSpectra(data, columns=sorted_times, index=wavelengths)

    ### Add field attributes to dataframe
    dataframe.baseline = baseline
    dataframe.filedict = time_file_dict
    if name:
        dataframe.name = name

    ### Get headermeta data from first line in timefile that isn't darkfile.  Only checks one line
    ### Does not check for consistency
    # Empty default keeps update() below valid when no line qualifies.
    meta_partial = {}
    for line in tlines:
        if line[0] != darkfile:
            meta_partial = _get_headermetadata_timefile(line[0])  #DOUBLE CHECK THIS WORKS
            break

    ### Extract remaining metadata (file/time info) and return ###
    meta_general = get_headermetadata_dataframe(dataframe, time_file_dict)
    meta_general.update(meta_partial)
    dataframe.metadata = meta_general
    dataframe.specunit = 'nm'  #This autodetected in plots

    ### Sort dataframe by ascending time (could also sort spectral data) ###
    # NOTE(review): if this forwards to pandas, DataFrame.sort no longer
    # exists in current releases (sort_index(axis=1) replaces it) -- confirm
    # how TimeSpectra implements sort before changing.
    dataframe.sort(axis=1, inplace=True)  #axis1=columns

    return dataframe
def from_spec_files(file_list, name='', skiphead=17, skipfoot=1, check_for_overlapping_time=True, extract_dark=True):
    ''' Takes in raw files directly from Ocean optics USB2000 and USB650
    spectrometers and returns a skspec TimeSpectra.  If spectral data is
    stored without a header, can be called with skiphead=0.

    Parameters
    ----------
    file_list: Iterable of file paths.  It is copied internally; the
        caller's list is not modified.  Entries whose basename is
        '.gitignore' are skipped.

    name: Set name of returned TimeSpectra.

    skiphead/skipfoot: Number of header/footer lines to skip in each file.
        Defaults reflect that this filetype has a 17 line header and a
        1 line footer.

    check_for_overlapping_time: If True, raise IOError as soon as two files
        resolve to the same timestamp.  If False, the later file overwrites
        the earlier one and a warning with the duplicate count is logged.
        Really only useful for testing or otherwise cornercase instances.

    extract_dark: If True, extract_darkfile() is asked for a dark file in
        file_list (per the original notes: case-insensitive filename match
        on "dark"; warns if none is found, raises if several are found).
        A dark file that is found becomes the baseline and is excluded from
        the spectra.

    Returns
    -------
    TimeSpectra with specunit 'nm', filedict (timestamp -> filename, dark
    file excluded), baseline (dark Series or None) and metadata set.

    Raises
    ------
    IOError: duplicate timestamp while check_for_overlapping_time is True.

    Notes
    -----
    Built to work with 2-column data only!!!

    The frame is constructed from a dict of Series keyed by timestamp, so
    files with non-identical wavelength grids can be combined (missing
    points become nans).
    '''

    def _read_header(path):
        ''' Return the first skiphead lines of path, stripped. '''
        with open(path) as f:
            return [next(f).strip() for _ in range(skiphead)]

    dict_of_series = {}   #Dict of series eventually merged to dataframe
    time_file_dict = {}   #Dict of time:filename (darkfile intentionally excluded)
    _overlap_count = 0    # Tracks if overlapping occurs

    # Bound up front so the code below never hits an unbound name when
    # extract_dark is False or when no dark file is found.
    darkfile = None
    baseline = None

    # Private copy: the dark file is removed from it below and the caller's
    # list must not change as a side effect.
    file_list = list(file_list)

    ### If looking for a darkfile, this will find it.  Bit redundant but I'm lazy..###
    if extract_dark:
        darkfile = extract_darkfile(file_list, return_null=True)

        if darkfile:
            header = _read_header(darkfile)
            wavedata = np.genfromtxt(darkfile, dtype=spec_dtype,
                                     skip_header=skiphead, skip_footer=skipfoot)
            # Result is unused; the call is retained so the dark header is
            # still parsed as before.  NOTE(review): drop if not needed.
            _get_datetime_specsuite(header)
            baseline = Series(wavedata['intensity'],
                              index=wavedata['wavelength'], name=darkfile)
            file_list.remove(darkfile)

    file_list = [f for f in file_list if os.path.basename(f) != '.gitignore']

    for infile in file_list:

        ###Read in only the header lines, not all the lines of the file
        header = _read_header(infile)

        #Store wavelength, intensity data in a 2-column datatime for easy itemlookup
        #Eg wavedata['wavelength']
        wavedata = np.genfromtxt(infile, dtype=spec_dtype,
                                 skip_header=skiphead, skip_footer=skipfoot)

        # Extract time data from header
        timestamp = _get_datetime_specsuite(header)

        # Make sure timepoints aren't overlapping with any others
        if timestamp in time_file_dict:
            _overlap_count += 1
            if check_for_overlapping_time:
                raise IOError('Duplicate time %s found in between files %s, %s.'
                              ' To overwrite, set check_for_overlapping_time = False.'
                              % (timestamp, infile, time_file_dict[timestamp]))

        time_file_dict[timestamp] = infile
        dict_of_series[timestamp] = Series(wavedata['intensity'],
                                           index=wavedata['wavelength'])

    ### Make timespec, add filenames, baseline and metadata attributes (note, DateTimeIndex auto sorts!!)
    timespec = TimeSpectra(DataFrame(dict_of_series), name=name)  #Dataframe beacuse TS doesn't handle dict of series
    timespec.specunit = 'nm'
    timespec.filedict = time_file_dict
    timespec.baseline = baseline  #KEEP THIS AS DARK SERIES RECALL IT IS SEPARATE FROM reference OR REFERENCE..

    ### Take metadata from first file in filelist that isn't darkfile
    # Empty default keeps update() below valid when no file qualifies.
    meta_partial = {}
    for infile in file_list:
        if infile != darkfile:
            meta_partial = _get_metadata_fromheader(_read_header(infile))
            break

    meta_general = get_headermetadata_dataframe(timespec, time_file_dict)
    meta_general.update(meta_partial)
    timespec.metadata = meta_general

    if _overlap_count:
        logger.warn('Time duplication found in %s of %s files.  Duplicates were '
                    'removed!' % (_overlap_count, len(file_list)))

    return timespec