Example No. 1
def test_loading_cleaned_spreadsheet(sandbox_configuration):
    sandbox_configuration.with_workspace('spreadsheets')

    df = default_dataset(match='already_cleaned', write=False)

    assert (sorted(list(
        df.columns)) == ['description', 'id', 'location', 'path'])
Example No. 2
def test_normalize_and_complete_data(sandbox_configuration):
    sandbox_configuration.with_workspace('spreadsheets')
    df = default_dataset(match='normalize_and_complete', write=False)

    assert (list(df['location']) == ['ALG-MC'] * 4 + ['BL7'] * 2)
    assert (list(df['hv']) == [5.93] + [4.2] * 5)

    assert (sorted(list(df.columns)) == ['hv', 'id', 'location', 'path'])
Example No. 3
def test_multiple_spreadsheets_throws_assertion_error(sandbox_configuration):
    sandbox_configuration.with_workspace('spreadsheets')

    with pytest.raises(AssertionError) as e:
        df = default_dataset()

    assert ('.xlsx' in str(e.value))
    assert (' == 1' in str(e.value))
Example No. 4
def test_whitespace_should_be_trimmed_from_columns_without_errors(
        sandbox_configuration):
    sandbox_configuration.with_workspace('spreadsheets')
    df = default_dataset(match='whitespace_in_columns', write=False)

    assert (len(df) == 6)
    assert (sorted(list(
        df.columns)) == ['description', 'id', 'location', 'path'])
Example No. 5
def test_skip_headers(sandbox_configuration):
    sandbox_configuration.with_workspace('spreadsheets')

    df = default_dataset(match='skip_headers', write=False)

    assert (sorted(list(df.columns)) == [
        'beta', 'description', 'id', 'location', 'path', 'phi', 'temperature',
        'theta'
    ])
Example No. 6
def test_user_is_warned_about_required_columns(sandbox_configuration):
    sandbox_configuration.with_workspace('spreadsheets')

    with pytest.warns(Warning) as w:
        with pytest.raises(ValueError) as v:
            df = default_dataset(match='user_warnings', write=False)

    assert ('You must supply both a `file` and a `location` column in your '
            'spreadsheet in order to load data.' in str(w[-1].message.args[0]))
    assert ('Could not safely read dataset.' in str(v.value))
    assert (
        'Did you supply both a `file` and a `location` column in your spreadsheet?'
        in str(v.value))
Example No. 7
def load_dataset(dataset_uuid=None, filename=None, df: pd.DataFrame = None):
    """
    You might want to prefer ``simple_load`` over calling this directly, as it is more convenient.

    :param dataset_uuid: UUID of the dataset to load. Typically you get this from ds.loc['...'].id; a
    dataframe slice is also accepted, so ds.loc['...'] works as well.
    :param filename: Path of the cached file to load. If omitted, it is derived from ``dataset_uuid``.
    :param df: Dataframe to look the data up in. If none is provided, the result of ``default_dataset`` is used.
    :return: The loaded data as an ``xr.Dataset`` or ``xr.DataArray`` with its attributes restored.
    """
    if df is None:
        try:
            from arpes.utilities import default_dataset  # break circular dependency
            df = default_dataset()
        except Exception: # pylint: disable=broad-except
            pass

    if filename is None:
        filename = _filename_for(dataset_uuid)

    if not os.path.exists(filename):
        raise ValueError('%s is not cached on the FS. Did you run `prepare_raw_data`?' % filename)

    try:
        arr = xr.open_dataset(filename)
    except ValueError:
        arr = xr.open_dataarray(filename)
    arr = unwrap_datavar_attrs(arr)

    # If the sample is associated with a cleave, attach the information from that cleave
    if 'sample' in arr.attrs and 'cleave' in arr.attrs:
        full_cleave_name = '%s-%s' % (arr.attrs['sample'], arr.attrs['cleave'])

        with open(CLEAVE_RECORD, 'r') as file:
            cleaves = json.load(file)

        skip_keys = {'included_scans', 'note'}
        for k, v in cleaves.get(full_cleave_name, {}).items():
            if k not in skip_keys and k not in arr.attrs:
                arr.attrs[k] = v

    if 'ref_id' in arr.attrs:
        arr.attrs['ref_attrs'] = load_dataset_attrs(arr.attrs['ref_id'])

    for prop in FREEZE_PROPS:
        if prop not in arr.attrs:
            arr.attrs[prop] = getattr(arr.S, prop)

    arr.attrs['df'] = df

    return arr
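
# A brief usage sketch for load_dataset (the row label 'f_0001.fits' below is a
# hypothetical placeholder, not a file from the source). The UUID normally comes
# from the cleaned dataset spreadsheet, and a dataframe slice is accepted as well.
from arpes.utilities import default_dataset

ds = default_dataset()
scan = load_dataset(dataset_uuid=ds.loc['f_0001.fits'].id, df=ds)
same_scan = load_dataset(dataset_uuid=ds.loc['f_0001.fits'], df=ds)  # slice form also works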
Example No. 8
def make_reference_plots(df: pd.DataFrame = None, with_kspace=False):
    if df is None:
        df = default_dataset()

    try:
        df = df[df.spectrum_type != 'xps_spectrum']
    except TypeError:
        warnings.warn(
            'Unable to filter out XPS files, did you attach spectra type?')

    # Make scans indicating cut locations
    for index, row in df.iterrows():
        try:
            scan = simple_load(index)

            if isinstance(scan, xr.Dataset):
                # make plot series normalized by current:
                scan.S.reference_plot(out=True)
            else:
                scan.S.reference_plot(out=True, use_id=False)

                if scan.S.spectrum_type == 'spectrum':
                    # Also go and make a normalized version
                    normed = normalize_dim(scan, 'phi')
                    normed.S.reference_plot(out=True,
                                            use_id=False,
                                            pattern='{}_norm_phi.png')

                    if with_kspace:
                        kspace_converted = convert_scan_to_kspace(scan)
                        kspace_converted.S.reference_plot(out=True,
                                                          use_id=False,
                                                          pattern='k_{}.png')

                        normed_k = normalize_dim(kspace_converted, 'kp')
                        normed_k.S.reference_plot(out=True,
                                                  use_id=False,
                                                  pattern='k_{}_norm_kp.png')

        except Exception as e:
            print(str(e))
            warnings.warn('Cannot make plots for {}'.format(index))
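
# Usage sketch, grounded in the signature above: build reference plots for every
# non-XPS scan in the default dataset, optionally with k-space converted versions.
make_reference_plots()
make_reference_plots(with_kspace=True)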
Example No. 9
def simple_load(fragment, df: pd.DataFrame = None, workspace=None, basic_prep=True):
    with WorkspaceManager(workspace):
        if df is None:
            from arpes.utilities import default_dataset  # break circular dependency
            df = default_dataset()

        def resolve_fragment(filename):
            return str(filename).split('_')[-1]

        # find a soft match
        files = df.index

        def strip_left_zeros(value):
            if len(value) == 1:
                return value

            return value.lstrip('0')

        if isinstance(fragment, (int, np.int32, np.int64,)):
            numbers = [int(f) for f in [strip_left_zeros(''.join(c for c in resolve_fragment(f) if c.isdigit()))
                                        for f in files] if len(f)]
            index = numbers.index(fragment)
        else:
            fragment = str(fragment)
            matches = [i for i, f in enumerate(files) if fragment in f]
            if not matches:
                raise ValueError('No match found for {}'.format(fragment))
            if len(matches) > 1:
                raise ValueError('Unique match not found for {}. Options are: {}'.format(
                    fragment, [files[i] for i in matches]))
            index = matches[0]

        data = load_dataset(dataset_uuid=df.loc[df.index[index]], df=df)

        if basic_prep:
            if 'cycle' in data.indexes and len(data.coords['cycle']) == 1:
                data = data.sum('cycle', keep_attrs=True)

        return data
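
# Usage sketch for simple_load; the fragment and workspace values are hypothetical.
# An integer matches the trailing scan number of a filename, while a string must
# match a unique substring of exactly one filename in the default dataset.
scan_by_number = simple_load(3)
scan_by_name = simple_load('cut_LT', workspace='example_workspace')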
Example No. 10
def direct_load(fragment, df: pd.DataFrame = None, workspace=None, file=None, basic_prep=True, **kwargs):
    """
    Loads a dataset directly from the denormalized source format, in the same manner that
    prepare_raw_files does. This is useful for testing data loading procedures and for quickly
    opening data at beamlines.

    The structure is very similar to simple_load and could be shared. The only differences are in
    selecting the DataFrame listing all the files at the beginning and in how the data is finally
    loaded at the end.
    :param fragment: Scan number or filename fragment used to select a single file from the dataframe.
    :param df: Dataframe listing the files to load. If none is provided, it is built from the current workspace.
    :param workspace: Workspace to resolve files against; defaults to the active workspace.
    :param file: Spreadsheet to build the dataframe from when ``df`` is not given.
    :param basic_prep: If True, sum away a singleton 'cycle' coordinate after loading.
    :param kwargs: Passed through to ``load_scan``.
    :return: The loaded data.
    """

    with WorkspaceManager(workspace):
        # first get our hands on a dataframe that has a list of all the files, where to find them on disk, and metadata
        if df is None:
            arpes.config.attempt_determine_workspace(lazy=True)
            if file is None:
                from arpes.utilities import default_dataset  # break circular dependency
                df = default_dataset(with_inferred_cols=False)
            else:
                if not os.path.isabs(file):
                    file = os.path.join(CONFIG['WORKSPACE']['path'], file)

                df = clean_xlsx_dataset(file, with_inferred_cols=False, write=False)

        def resolve_fragment(filename: int) -> str:
            return str(filename).split('_')[-1]

        # find a soft match
        files = df.index

        def strip_left_zeros(value: str) -> str:
            if len(value) == 1:
                return value

            return value.lstrip('0')

        if isinstance(fragment, (int, np.int32, np.int64,)):
            numbers = [int(f) for f in [strip_left_zeros(''.join(c for c in resolve_fragment(f) if c.isdigit()))
                                        for f in files] if len(f)]
            index = numbers.index(fragment)
        else:
            fragment = str(fragment)
            matches = [i for i, f in enumerate(files) if fragment in f]
            if not matches:
                raise ValueError('No match found for {}'.format(fragment))
            if len(matches) > 1:
                raise ValueError('Unique match not found for {}. Options are: {}'.format(
                    fragment, [files[i] for i in matches]))
            index = matches[0]

        scan = df.loc[df.index[index]]
        data = load_scan(dict(scan), **kwargs)

        if basic_prep:
            if 'cycle' in data.indexes and len(data.coords['cycle']) == 1:
                data = data.sum('cycle', keep_attrs=True)

        return data
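
# Usage sketch for direct_load; the fragment and spreadsheet name are hypothetical.
# With no df or file given, the dataframe is inferred from the current workspace.
scan = direct_load(3)
scan_from_sheet = direct_load('map', file='scans.xlsx')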
Example No. 11
def test_normalize_column_names(sandbox_configuration):
    sandbox_configuration.with_workspace('spreadsheets')
    df = default_dataset(match='normalize_column_names', write=False)

    assert (sorted(list(
        df.columns)) == ['hv', 'id', 'location', 'path', 'temperature'])