# Example #1
    def import_datafile(self, fn, rec, redo=False):
        """
        data file -> sample(s)

        Decodes the record's CSV payload into a dataframe, splits it
        into per-sample frames, and yields one imported session per
        sample found in the file.
        """
        # Guard clauses for files we cannot (yet) handle
        if "NUPM-MON" in rec.basename:
            raise SparrowImportError("NUPM-MON files are not handled yet")
        if not rec.csv_data:
            raise SparrowImportError("CSV data not extracted")

        try:
            data, meta = decode_datatable(rec.csv_data)
            self.meta = meta
            data.index.name = 'analysis'
        except IndexError as err:
            raise SparrowImportError(err)

        data = generalize_samples(data)

        # Level 0 of the index holds the sample names
        names = list(data.index.unique(level=0))
        if self.verbose:
            echo("Samples: " + ", ".join(names))

        for name in names:
            frame = _sample_dataframe(data, name)
            try:
                yield self.import_session(rec, frame)
            except (IntegrityError, ProgrammingError, DataError) as err:
                # Database errors carry the driver-level cause in .orig
                raise SparrowImportError(str(err.orig))
            # Handle common error types
            except (IndexError, ValueError, AssertionError, TypeError) as err:
                raise SparrowImportError(err)
# Example #2
def encode_datatable(infile):
    """
    Read the "datatable" sheet of an Excel file and return its contents
    as UTF-8-encoded CSV bytes.

    Raises NotImplementedError for "AGE PICK" files and
    SparrowImportError when the sheet is missing or unreadable.
    """
    try:
        wb = get_excel_reader(infile)
        df = read_excel(wb, sheet_name="datatable", header=None)
    except XLRDError:
        # assumes infile is a Path-like with .stem — TODO confirm callers
        if "AGE PICK" in infile.stem:
            raise NotImplementedError("AGE PICK files are not yet handled")
        raise SparrowImportError("No data table")
    except AssertionError:
        raise SparrowImportError("Could not open data table")

    b = StringIO()
    # NOTE: the original passed compression='gzip' here, but pandas
    # ignores (or rejects) compression when writing to a text buffer,
    # and the .encode() below requires plain CSV text anyway.
    df.to_csv(b)
    b.seek(0)
    # Convert to a binary representation
    return b.read().encode()
# Example #3
    def import_analysis(self, row, session):
        """
        row -> analysis
        """
        # session index should not be nan
        try:
            ix = int(row.name[2])
        except ValueError:
            ix = None

        analysis = self.add_analysis(session,
                                     session_index=ix,
                                     analysis_name=str(row.name[1]))

        for i in row.iteritems():
            try:
                d = self.import_datum(analysis, *i, row)
            except (ValueError, AttributeError) as err:
                # Correct thing to do: raise SparrowImportError.
                # This tells the application that you explicitly handled
                # this error, and to report it without stopping.
                raise SparrowImportError(err)
            if d is None:
                continue
            yield d
# Example #4
def normalize_data(df):
    """
    Normalize a raw "datatable" dataframe into a (data, meta) pair.

    Strips the title row, empty rows/columns, and trailing end matter;
    builds column metadata from the two header rows; and truncates
    duplicated output columns past the expected width.

    Raises SparrowImportError for empty or mis-shaped tables.
    """
    # Skip a leading "Table ..." title row if present
    if df.iloc[0, 0].startswith("Table"):
        df = df[1:]

    # Drop empty rows
    df = df.dropna(how='all')

    # Drop columns that are empty in the header
    # Note: this does not preserve comments and some other metadata;
    # we may want to.
    header = df.iloc[1:3, 1:]
    # Make sure second row of header (chiefly units) is set to null
    # if first row is null (this helps get rid of trailing end matter)
    header.iloc[1] = header.iloc[1].mask(header.iloc[0].isnull())
    header = header.dropna(axis=1, how='all')
    meta = table_metadata(header)

    body = df.iloc[3:].set_index(df.columns[0])
    body.index.name = 'Analysis'
    # Make sure data is the same shape as headers.
    # axis=1 must be a keyword: the positional `axis` argument to
    # DataFrame.drop was removed in pandas 2.0.
    data = (body.drop(body.columns.difference(header.columns), axis=1)
                .dropna(how='all'))

    # We've found a few empty data frames
    if data.empty:
        raise SparrowImportError('Empty dataframe')

    # Explicit check instead of `assert` (asserts are stripped under -O);
    # the raised exception type is unchanged for callers.
    if data.shape[1] != meta.shape[1]:
        raise SparrowImportError('Data frame is not correct shape')

    # For some reason we have a lot of these closed brackets in data files
    # (regex=False: this is a literal replacement, not a pattern)
    data.index = data.index.str.replace(' <>', '', regex=False).str.strip()
    data.columns = meta.columns

    ncols = 19
    if data.shape[1] == ncols:
        return data, meta

    if len(data.columns[:ncols].intersection(data.columns[ncols:])) > 0:
        secho("Ignoring duplicate output columns.")
        data = data.iloc[:, :ncols]
        meta = meta.iloc[:, :ncols]

    return data, meta
    def import_datafile(self, fn, rec, **kwargs):
        """
        Import an original data file

        Extracts the data tables from the Excel file at path *fn*,
        creates/links sample, material, instrument, method and session
        records, imports the incremental-heating steps and age results,
        and yields the session record that should be linked to the
        datafile.

        Raises SparrowImportError if table extraction fails.
        """
        # Extract data tables from Excel sheet

        # File modification time is right now the best proxy
        # for creation date (note: sessions will be duplicated
        # if input files are changed)
        mod_time = datetime.fromtimestamp(path.getmtime(fn))

        try:
            incremental_heating, info, results = extract_data_tables(fn)
        except Exception as exc:
            # Wrap ANY extraction failure so the application reports it
            # without stopping the whole import run
            raise SparrowImportError(str(exc))
        if self.show_data:
            print_dataframe(incremental_heating)
            print_dataframe(info)
            print_dataframe(results.transpose())

        # Keys consumed here are removed from `info` via pop(); the
        # remainder is stored on the session further down
        sample = self.sample(name=info.pop('Sample'))
        target = self.material(info.pop('Material'))
        instrument = self.db.get_or_create(self.m.instrument,
                                           name="MAP 215-50")
        method = self.method("Ar/Ar " + info.pop("Type"))
        self.add(sample, target, instrument, method)
        # NOTE(review): flush presumably makes generated ids available
        # for the foreign keys used just below — confirm ORM semantics
        self.db.session.flush()

        session = self.db.get_or_create(self.m.session,
                                        sample_id=sample.id,
                                        instrument=instrument.id,
                                        technique=method.id,
                                        date=mod_time,
                                        target=target.id)
        # Only the day component of the file mtime is meaningful
        session.date_precision = "day"
        self.add(session)
        self.db.session.flush()

        # Store the leftover metadata on the session record
        info = self.general_info(session, info)
        session.data = info.to_dict()

        for i, step in enumerate(incremental_heating.iterrows()):
            self.import_heating_step(i, step, session, incremental_heating)

        # Import results table
        try:
            res = results.loc["Age Plateau"]
            self.import_age_plateau(session, res)
        except KeyError:
            # Not every file reports a plateau age; that's fine
            pass

        res = results.loc["Total Fusion Age"]
        self.import_fusion_age(session, res)

        # This function returns the top-level
        # record that should be linked to the datafile
        self.db.session.flush()
        yield session
# Example #6
def get_excel_reader(infile):
    """
    Open *infile* (a filename or an in-memory file object) as an xlrd
    workbook in on-demand mode.

    Any failure is re-raised as SparrowImportError.
    """
    try:
        if isinstance(infile, IOBase):
            # In-memory file: hand xlrd the raw bytes
            contents = infile.read()
            return open_workbook(file_contents=contents, on_demand=True)
        # Otherwise treat it as a filename string
        return open_workbook(infile, on_demand=True)
    except Exception as err:
        raise SparrowImportError(str(err))
def generalize_samples(input):
    """Generalize sample ids into `sample_name`, `analysis_name`,
       and `session_index` columns, returning a dataframe indexed by
       that triple.

       Raises SparrowImportError when the extracted sample names look
       wrong (too many unique values)."""

    # Create sample name columns
    data = input.reset_index()
    data.rename(columns={'Analysis': 'analysis'}, inplace=True)

    # Strip out extra data
    data['analysis'] = data['analysis'].str.strip(delimiters)
    data['analysis_name'] = data['analysis'].apply(extract_analysis_name)
    # Strip the analysis suffix off of the sample ID
    data['sample_name'] = data.apply(strip_analysis_name, axis=1)

    # Group by the scalar column name: grouping by a one-element LIST
    # yields 1-tuple keys in pandas >= 2.0, which would break the
    # `sample_name.startswith('Spot')` check below.
    for sample_name, group in data.groupby("sample_name"):
        unique_suffix = group['analysis_name'].unique()
        # If we don't have enough unique suffixes, it's probable that we actually
        # grabbed part of the sample ID. In that case, we fall back to the
        # original sample id
        ix = data['sample_name'] == sample_name
        if len(unique_suffix) / len(group) < 0.4:
            # Not many of our analysis names are unique, so we fall back
            # to dealing with samples without internal enumeration
            data.loc[ix, 'sample_name'] = data.loc[ix, 'analysis']
            data.loc[ix, 'analysis_name'] = None

        if sample_name.startswith('Spot'):
            # It appears we don't have a sample name, instead
            data.loc[ix, 'sample_name'] = None
            data.loc[ix, 'analysis_name'] = data.loc[ix, 'analysis']

    n_samples = len(data['sample_name'].unique())
    if n_samples > 0.3 * len(data) and n_samples > 20:
        # We have a lot of "unique" samples even though we tried to extract
        # sample IDs. We are probably doing something wrong.
        raise SparrowImportError("Too many unique samples; skipping import.")

    # Session index is extracted if an integer can be found easily
    cleaned_name = data['analysis_name'].str.replace("Spot",
                                                     "").str.strip(delimiters)
    data['session_index'] = to_numeric(cleaned_name,
                                       errors='coerce',
                                       downcast='integer')

    print_sample_info(data, verbose=True)

    return data.set_index(["sample_name", "analysis_name", "session_index"],
                          drop=True)
# Example #8
def check_matching_irradiation(sample, row):
    """
    Check that *row*'s Irradiation value matches the Irradiation ID
    recorded on the sample's first session.

    Returns None; raises SparrowImportError on a mismatch. A sample
    with no sessions passes trivially.
    """
    try:
        session = sample.session_collection[0]
    except IndexError:
        # Sample is not linked to any sessions, and that's OK!
        return

    irr = session.get_attribute("Irradiation ID")
    # Sessions should not have two Irradiation IDs
    assert len(irr) == 1
    v = irr[0].value
    v1 = str(row['Irradiation'])
    if not v.startswith(v1):
        # Include both values in the message — the original f-string
        # had no placeholders, which defeated its purpose
        raise SparrowImportError(f"Irradiation mismatch: expected {v1}, got {v}")
    print(f"  Irradiation {v}")