def test_resolve_path_indexerror(self, schema: SortedDict):
    """resolve_path must raise IndexError when a schema value is absent
    from the generated DataFrame's column headers."""
    frame: DataFrame = make_dataframe(
        5, 4, data_gen_f=TestResolvePath.data_gen)
    with pytest.raises(IndexError):
        resolve_path(self.archive, schema, frame)
def test_resolve_path_valueerror(self, schema):
    """resolve_path must raise ValueError when the data generator yields
    invalid (non-normalizable) metadata values."""
    frame = make_dataframe(
        5, 4, data_gen_f=TestResolvePath.data_gen_invalid)
    with pytest.raises(ValueError):
        resolve_path(self.archive, schema, frame)
def test_resolve_path(self, schema: SortedDict, expected: str):
    """resolve_path should build the expected archive path from valid data."""
    frame: DataFrame = make_dataframe(
        5, 4, data_gen_f=TestResolvePath.data_gen)
    result: str = resolve_path(self.archive, schema, frame)
    assert result == expected
def test_resolve_path_normalized(self, schema, expected):
    """resolve_path should normalize path components produced by the
    normalizable data generator and still match the expected path."""
    frame = make_dataframe(
        5, 4, data_gen_f=TestResolvePath.data_gen_normalizable)
    result = resolve_path(self.archive, schema, frame)
    assert result == expected
def archive(context: Context):
    """Store the files specified in the current context.

    Args:
        context (Context): Runtime settings object.

    Raises:
        FileExistsError: An archive file already exists with the same filepath.
        IndexError: Schema value is not a column header of a given DataFrame.
        OSError: File operation error. Error type raised may be a
            subclass of OSError.
        ParserError: Error raised by pandas.read_csv.
        ValueError: More than one unique metadata value exists under a
            column header.
    """
    from glob import glob
    from os import makedirs
    from os.path import exists, join, split

    from pandas import concat, DataFrame, read_csv, Series
    from pandas.errors import EmptyDataError
    from sortedcontainers import SortedList

    from syphon.schema import check_columns, resolve_path

    from . import datafilter
    from . import file_map
    from ._lockmanager import LockManager

    lock_manager = LockManager()
    lock_list = list()

    # Add a '#lock' file to the data directory while we work.
    # (IndexError propagates if the data glob matched nothing.)
    data_list = SortedList(glob(context.data))
    lock_list.append(lock_manager.lock(split(data_list[0])[0]))

    # Add a '#lock' file to the metadata directory, when metadata was given.
    meta_list = SortedList()
    if context.meta is not None:
        meta_list = SortedList(glob(context.meta))
        lock_list.append(lock_manager.lock(split(meta_list[0])[0]))

    fmap = file_map(data_list, meta_list)

    for datafile in fmap:
        _, datafilename = split(datafile)

        try:
            # TODO: Issue #9 - 'open file' abstractions here
            data_frame = DataFrame(read_csv(datafile, dtype=str))
        except EmptyDataError:
            # Trigger the empty check below.
            data_frame = DataFrame()

        if data_frame.empty:
            print('Skipping empty data file @ {}'.format(datafile))
            continue

        # Remove empty columns.
        data_frame.dropna(axis=1, how='all', inplace=True)
        total_rows, _ = data_frame.shape

        # Merge all metadata files into a single DataFrame.
        meta_frame = None
        for metafile in fmap[datafile]:
            # TODO: Issue #9 - 'open file' abstractions here
            new_frame = DataFrame(read_csv(metafile, dtype=str))
            new_frame.dropna(axis=1, how='all', inplace=True)

            for header in list(new_frame.columns.values):
                # Complain if there's more than one value in a column.
                if len(new_frame[header].drop_duplicates().values) > 1:
                    raise ValueError(
                        'More than one value exists under the {} column.'
                        .format(header))
                # BUG FIX: was 'len(...) is total_rows', which compares
                # int identity and is False for row counts outside
                # CPython's small-int cache (> 256). Use '==' instead.
                if len(new_frame[header]) == total_rows:
                    # Metadata column already spans every data row.
                    if meta_frame is None:
                        meta_frame = new_frame[header]
                    else:
                        meta_frame = concat(
                            [meta_frame, new_frame[header]], axis=1)
                else:
                    # Broadcast the single metadata value down every row.
                    meta_value = new_frame[header].iloc[0]
                    series = Series([meta_value] * total_rows, name=header)
                    if meta_frame is None:
                        meta_frame = DataFrame(series)
                    else:
                        meta_frame = concat([meta_frame, series], axis=1)

        if meta_frame is not None:
            data_frame = concat([data_frame, meta_frame], axis=1)

        # IndexError propagates when a schema value is not a column header.
        check_columns(context.schema, data_frame)

        filtered_data = datafilter(context.schema, data_frame)
        # BUG FIX: was 'len(filtered_data) is 0' — int identity check.
        if len(filtered_data) == 0:
            filtered_data = [data_frame]

        for data in filtered_data:
            # IndexError/ValueError from resolve_path propagate to caller.
            path = resolve_path(context.archive, context.schema, data)

            target_filename = join(path, datafilename)
            if exists(target_filename) and not context.overwrite:
                raise FileExistsError('Archive error: file already exists @ '
                                      '{}'.format(target_filename))
            # OSError from the filesystem propagates to caller.
            makedirs(path, exist_ok=True)
            data.to_csv(target_filename, index=False)

    # Release every lock taken above (LIFO order).
    while len(lock_list) > 0:
        lock = lock_list.pop()
        lock_manager.release(lock)