def import_datafile(self, fn, rec, redo=False):
    """ data file -> sample(s) """
    if "NUPM-MON" in rec.basename:
        raise SparrowImportError("NUPM-MON files are not handled yet")
    if not rec.csv_data:
        raise SparrowImportError("CSV data not extracted")
    try:
        data, meta = decode_datatable(rec.csv_data)
        self.meta = meta
        data.index.name = 'analysis'
    except IndexError as err:
        raise SparrowImportError(err)

    data = generalize_samples(data)
    sample_names = list(data.index.unique(level=0))
    if self.verbose:
        echo("Samples: " + ", ".join(sample_names))

    for sample_name in sample_names:
        df = _sample_dataframe(data, sample_name)
        try:
            yield self.import_session(rec, df)
        except (IntegrityError, ProgrammingError, DataError) as err:
            raise SparrowImportError(str(err.orig))
        # Handle common error types
        except (IndexError, ValueError, AssertionError, TypeError) as err:
            raise SparrowImportError(err)

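# Illustrative sketch only: `_sample_dataframe` is defined elsewhere in the
# project and is not shown here. Assuming the (sample_name, analysis_name,
# session_index) index produced by `generalize_samples`, a minimal version
# could select one sample's analyses like this.
def _sample_dataframe_sketch(data, sample_name):
    # Select every analysis belonging to a single sample, keeping the
    # full multi-index so downstream code can still read analysis names
    return data.xs(sample_name, level=0, drop_level=False)
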
def encode_datatable(infile):
    try:
        wb = get_excel_reader(infile)
        df = read_excel(wb, sheet_name="datatable", header=None)
    except XLRDError:
        if "AGE PICK" in infile.stem:
            raise NotImplementedError("AGE PICK files are not yet handled")
        raise SparrowImportError("No data table")
    except AssertionError:
        raise SparrowImportError("Could not open data table")
    b = StringIO()
    df.to_csv(b, compression='gzip')
    b.seek(0)
    # Convert to a binary representation
    return b.read().encode()

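# Round-trip sketch (an assumption, not the project's decode_datatable used
# above): if the stored bytes are plain CSV text, they can be read back into
# a DataFrame for inspection roughly like this.
from io import BytesIO
from pandas import read_csv

def _read_back_datatable(csv_bytes):
    # index_col=0 restores the positional index written by to_csv above
    return read_csv(BytesIO(csv_bytes), index_col=0)
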
def import_analysis(self, row, session):
    """ row -> analysis """
    # session index should not be nan
    try:
        ix = int(row.name[2])
    except ValueError:
        ix = None
    analysis = self.add_analysis(
        session,
        session_index=ix,
        analysis_name=str(row.name[1]))
    for i in row.iteritems():
        try:
            d = self.import_datum(analysis, *i, row)
        except (ValueError, AttributeError) as err:
            # Correct thing to do: raise SparrowImportError.
            # This tells the application that you explicitly handled
            # this error, and to report it without stopping.
            raise SparrowImportError(err)
        if d is None:
            continue
        yield d

def normalize_data(df):
    if df.iloc[0, 0].startswith("Table"):
        df = df[1:]
    # Drop empty rows
    df = df.dropna(how='all')

    # Drop columns that are empty in the header
    # Note: this does not preserve comments and some other metadata;
    # we may want to.
    header = df.iloc[1:3, 1:]
    # Make sure second row of header (chiefly units) is set to null
    # if first row is null (this helps get rid of trailing end matter)
    header.iloc[1] = header.iloc[1].mask(header.iloc[0].isnull())
    header = header.dropna(axis=1, how='all')
    meta = table_metadata(header)

    body = df.iloc[3:].set_index(df.columns[0])
    body.index.name = 'Analysis'
    # Make sure data is the same shape as headers
    data = (body.drop(body.columns.difference(header.columns), 1)
                .dropna(how='all'))
    # We've found a few empty data frames
    if data.empty:
        raise SparrowImportError('Empty dataframe')
    try:
        assert data.shape[1] == meta.shape[1]
    except AssertionError:
        raise SparrowImportError('Data frame is not correct shape')

    # For some reason we have a lot of these closed brackets in data files
    data.index = data.index.str.replace(' <>', '').str.strip()
    data.columns = meta.columns

    ncols = 19
    if data.shape[1] == ncols:
        return data, meta
    if len(data.columns[:ncols].intersection(data.columns[ncols:])) > 0:
        secho("Ignoring duplicate output columns.")
        data = data.iloc[:, :ncols]
        meta = meta.iloc[:, :ncols]
    return data, meta

def import_datafile(self, fn, rec, **kwargs):
    """ Import an original data file """
    # File modification time is right now the best proxy
    # for creation date (note: sessions will be duplicated
    # if input files are changed)
    mod_time = datetime.fromtimestamp(path.getmtime(fn))

    # Extract data tables from Excel sheet
    try:
        incremental_heating, info, results = extract_data_tables(fn)
    except Exception as exc:
        raise SparrowImportError(str(exc))

    if self.show_data:
        print_dataframe(incremental_heating)
        print_dataframe(info)
        print_dataframe(results.transpose())

    sample = self.sample(name=info.pop('Sample'))
    target = self.material(info.pop('Material'))
    instrument = self.db.get_or_create(self.m.instrument, name="MAP 215-50")
    method = self.method("Ar/Ar " + info.pop("Type"))
    self.add(sample, target, instrument, method)
    self.db.session.flush()

    session = self.db.get_or_create(
        self.m.session,
        sample_id=sample.id,
        instrument=instrument.id,
        technique=method.id,
        date=mod_time,
        target=target.id)
    session.date_precision = "day"
    self.add(session)
    self.db.session.flush()

    info = self.general_info(session, info)
    session.data = info.to_dict()

    for i, step in enumerate(incremental_heating.iterrows()):
        self.import_heating_step(i, step, session, incremental_heating)

    # Import results table
    try:
        res = results.loc["Age Plateau"]
        self.import_age_plateau(session, res)
    except KeyError:
        pass
    res = results.loc["Total Fusion Age"]
    self.import_fusion_age(session, res)

    # This function returns the top-level
    # record that should be linked to the datafile
    self.db.session.flush()
    yield session

def get_excel_reader(infile):
    try:
        if isinstance(infile, IOBase):
            # We have an in-memory file
            fc = infile.read()
            return open_workbook(file_contents=fc, on_demand=True)
        else:
            # We have a filename string
            return open_workbook(infile, on_demand=True)
    except Exception as err:
        raise SparrowImportError(str(err))

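# Usage sketch (hypothetical filename): the reader accepts either a path
# string or an already-open binary file object, e.g.
#
#   wb = get_excel_reader("example.xls")
#   with open("example.xls", "rb") as f:
#       wb = get_excel_reader(f)
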
def generalize_samples(input):
    """Generalize sample ids into `sample_name`, `analysis_name`,
    and `session_index` columns"""
    # Create sample name columns
    data = input.reset_index()
    data.rename(columns={'Analysis': 'analysis'}, inplace=True)
    # Strip out extra data
    data['analysis'] = data['analysis'].str.strip(delimiters)
    data['analysis_name'] = data['analysis'].apply(extract_analysis_name)
    # Strip the analysis suffix off of the sample ID
    data['sample_name'] = data.apply(strip_analysis_name, axis=1)

    for sample_name, group in data.groupby(["sample_name"]):
        unique_suffix = group['analysis_name'].unique()
        # If we don't have enough unique suffixes, it's probable that we
        # actually grabbed part of the sample ID. In that case, we fall back
        # to the original sample id.
        ix = data['sample_name'] == sample_name
        if len(unique_suffix) / len(group) < 0.4:
            # Not many of our analysis names are unique, so we fall back
            # to dealing with samples without internal enumeration
            data.loc[ix, 'sample_name'] = data.loc[ix, 'analysis']
            data.loc[ix, 'analysis_name'] = None
        if sample_name.startswith('Spot'):
            # It appears we don't have a sample name; use the full analysis
            # id as the analysis name instead
            data.loc[ix, 'sample_name'] = None
            data.loc[ix, 'analysis_name'] = data.loc[ix, 'analysis']

    n_samples = len(data['sample_name'].unique())
    if n_samples > 0.3 * len(data) and n_samples > 20:
        # We have a lot of "unique" samples even though we tried to extract
        # sample IDs. We are probably doing something wrong.
        raise SparrowImportError("Too many unique samples; skipping import.")

    # Session index is extracted if an integer can be found easily
    cleaned_name = data['analysis_name'].str.replace("Spot", "").str.strip(delimiters)
    data['session_index'] = to_numeric(
        cleaned_name, errors='coerce', downcast='integer')

    print_sample_info(data, verbose=True)
    return data.set_index(
        ["sample_name", "analysis_name", "session_index"], drop=True)

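# Hypothetical sketch of the helpers assumed above (`delimiters`,
# `extract_analysis_name`, and `strip_analysis_name` are defined elsewhere in
# the project and not shown here). One plausible implementation splits a
# trailing spot/grain suffix off the id, e.g. "MG-103 Spot 4" ->
# sample_name "MG-103", analysis_name "Spot 4".
import re

delimiters_sketch = " -_./"

def extract_analysis_name_sketch(analysis_id):
    # Treat a trailing "Spot N" or bare number as the per-analysis name
    match = re.search(r'(Spot\s*\d+|\d+)$', str(analysis_id))
    return match.group(0) if match else None

def strip_analysis_name_sketch(row):
    # Remove the extracted analysis name from the end of the full id
    name = row['analysis']
    suffix = row['analysis_name']
    if suffix and name.endswith(suffix):
        name = name[:-len(suffix)]
    return name.strip(delimiters_sketch)
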
def check_matching_irradiation(sample, row):
    # Check that irradiation matches
    try:
        session = sample.session_collection[0]
    except IndexError:
        # Sample is not linked to any sessions, and that's OK!
        return
    irr = session.get_attribute("Irradiation ID")
    # Sessions should not have two Irradiation IDs
    assert len(irr) == 1
    v = irr[0].value
    v1 = str(row['Irradiation'])
    if not v.startswith(v1):
        raise SparrowImportError(f"Irradiation mismatch: {v} does not match {v1}")
    print(f" Irradiation {v}")