def get(self, local_path):
    bucket, path = parsePath(local_path)
    print(bucket, path)
    fs_client = fs.LocalFileSystem()
    file_info_list = fs_client.get_file_info(
        fs.FileSelector(path, recursive=False))
    files = []
    dirs = []
    for info in file_info_list:
        if info.type.value == 2:  # File type
            files.append({
                'name': info.base_name,
                'ext': info.extension,
                'size': info.size,
                'mtime': info.mtime.isoformat()
            })
        elif info.type.value == 3:  # Directory type
            dirs.append({
                'name': info.base_name,
                'mtime': info.mtime.isoformat()
            })
    self.finish(json.dumps({'files': files, 'dirs': dirs}))
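A minimal sketch of the same listing logic that compares against pyarrow's FileType enum members rather than the raw values 2 and 3; the `list_dir` helper and its path argument are hypothetical, not part of the snippet above.

# Hypothetical sketch: use fs.FileType members instead of magic enum values.
from pyarrow import fs

def list_dir(path):
    local = fs.LocalFileSystem()
    infos = local.get_file_info(fs.FileSelector(path, recursive=False))
    files = [i.base_name for i in infos if i.type == fs.FileType.File]
    dirs = [i.base_name for i in infos if i.type == fs.FileType.Directory]
    return files, dirs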
def update_object_timeseries(self, detect_time, matches, write=False):
    if self.odb is None:
        self.get_object_timeseries()
    odb = self.odb
    name = "Observer: " + self.name
    if matches is not None:
        for match in matches:
            logger.debug(
                f"Observer: Match class: {match['class']}, score: {match['score']}, roi: {match['roi']}"
            )
            # y1, x1, y2, x2 = boxes[i]
            odb.loc[detect_time] = [
                match['class'],
                match['score'],
                match['roi'][1],
                match['roi'][0],
                match['roi'][3],
                match['roi'][2]
                # match['roi']
            ]
    if write is True:
        logger.debug(f"{name} : odb is {odb}")
        parquet_file = os.path.join(self.mydir, self.name + '_events.parquet')
        if os.path.isfile(parquet_file):
            logger.debug(f"{name}:File exists, will overwrite")
        odbtable = pa.Table.from_pandas(odb)
        localfs = pfs.LocalFileSystem()
        with localfs.open_output_stream(parquet_file) as pfile:
            pq.write_table(odbtable, pfile)
    self.odb = odb
def test_parquet_writer_filesystem_buffer_raises():
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)
    filesystem = fs.LocalFileSystem()

    # Should raise ValueError when filesystem is passed with file-like object
    with pytest.raises(ValueError, match="specified path is file-like"):
        pq.ParquetWriter(
            pa.BufferOutputStream(), table.schema, filesystem=filesystem
        )
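For contrast with the failure case above, a brief sketch (reusing the `table` built in that test) of the supported pattern: a file-like sink works on its own, it just cannot be combined with a filesystem argument.

# Sketch: writing Parquet to an in-memory buffer without a filesystem argument.
sink = pa.BufferOutputStream()
with pq.ParquetWriter(sink, table.schema) as writer:
    writer.write_table(table)
buf = sink.getvalue()                       # pa.Buffer holding the Parquet bytes
roundtripped = pq.read_table(pa.BufferReader(buf))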
def test_open_dataset_filesystem(tempdir):
    # single file
    table, path = _create_single_file(tempdir)

    # filesystem inferred from path
    dataset1 = ds.dataset(str(path))
    assert dataset1.schema.equals(table.schema)

    # filesystem specified
    dataset2 = ds.dataset(str(path), filesystem=fs.LocalFileSystem())
    assert dataset2.schema.equals(table.schema)

    # local filesystem specified with relative path
    with change_cwd(tempdir):
        dataset3 = ds.dataset("test.parquet", filesystem=fs.LocalFileSystem())
    assert dataset3.schema.equals(table.schema)

    # passing different filesystem
    with pytest.raises(FileNotFoundError):
        ds.dataset(str(path), filesystem=fs._MockFileSystem())
def __init__(self, storage_client: Client):
    self.period_per_file = timedelta(minutes=1)
    self.times = []
    self.amps = []
    self.phases = []
    self.client = storage_client
    self.local = fs.LocalFileSystem()
    self.log = logging.getLogger('Buffer')
def test_construct_from_single_directory(tempdir):
    directory = tempdir / 'single-directory'
    directory.mkdir()
    tables, paths = _create_directory_of_files(directory)

    d1 = ds.dataset(directory)
    d2 = ds.dataset(directory, filesystem=fs.LocalFileSystem())
    d3 = ds.dataset(directory.name, filesystem=_filesystem_uri(tempdir))
    t1 = d1.to_table()
    t2 = d2.to_table()
    t3 = d3.to_table()
    assert t1 == t2 == t3
def test_construct_from_single_file(tempdir):
    directory = tempdir / 'single-file'
    directory.mkdir()
    table, path = _create_single_file(directory)
    relative_path = path.relative_to(directory)

    # instantiate from a single file
    d1 = ds.dataset(path)
    # instantiate from a single file with a filesystem object
    d2 = ds.dataset(path, filesystem=fs.LocalFileSystem())
    # instantiate from a single file with prefixed filesystem URI
    d3 = ds.dataset(relative_path, filesystem=_filesystem_uri(directory))
    assert d1.to_table() == d2.to_table() == d3.to_table()
def test_open_dataset_filesystem(tempdir):
    # single file
    table, path = _create_single_file(tempdir)

    # filesystem inferred from path
    dataset1 = ds.dataset(str(path))
    assert dataset1.schema.equals(table.schema, check_metadata=False)

    # filesystem specified
    dataset2 = ds.dataset(str(path), filesystem=fs.LocalFileSystem())
    assert dataset2.schema.equals(table.schema, check_metadata=False)

    # passing different filesystem
    with pytest.raises(FileNotFoundError):
        ds.dataset(str(path), filesystem=fs._MockFileSystem())
def test_filesystem_uri(tmpdir):
    from pyarrow import orc

    table = pa.table({"a": [1, 2, 3]})

    directory = tmpdir / "data_dir"
    directory.mkdir()
    path = directory / "data.orc"
    orc.write_table(table, str(path))

    # filesystem object
    result = orc.read_table(path, filesystem=fs.LocalFileSystem())
    assert result.equals(table)

    # filesystem URI
    result = orc.read_table("data_dir/data.orc",
                            filesystem=util._filesystem_uri(tmpdir))
    assert result.equals(table)
def __init__(
    self,
    address=None,
    name: str = None,
    namespace: str = None,
    partitions: int = 1,
    storage_type: LocalFSStoreType = LocalFSStoreType.DISK,
    options=None,
):
    super(StorageTable, self).__init__(
        name=name,
        namespace=namespace,
        address=address,
        partitions=partitions,
        options=options,
        engine=StorageEngine.LOCALFS,
        store_type=storage_type,
    )
    self._local_fs_client = fs.LocalFileSystem()
def test_construct_from_list_of_files(tempdir):
    # instantiate from a list of files
    directory = tempdir / 'list-of-files'
    directory.mkdir()
    tables, paths = _create_directory_of_files(directory)

    relative_paths = [p.relative_to(tempdir) for p in paths]
    with change_cwd(tempdir):
        d1 = ds.dataset(relative_paths)
        t1 = d1.to_table()
        assert len(t1) == sum(map(len, tables))

    d2 = ds.dataset(relative_paths, filesystem=_filesystem_uri(tempdir))
    t2 = d2.to_table()
    d3 = ds.dataset(paths)
    t3 = d3.to_table()
    d4 = ds.dataset(paths, filesystem=fs.LocalFileSystem())
    t4 = d4.to_table()

    assert t1 == t2 == t3 == t4
def read_csv_write_to_parquet(local_data_path, s3_path, local_meta_path):
    if s3_path.startswith("s3://"):
        s3_path = s3_path.replace("s3://", "", 1)

    local = fs.LocalFileSystem()
    s3 = fs.S3FileSystem(region=REGION)

    with local.open_input_stream(local_data_path) as f:
        tab = csv.read_csv(f)

    metadata = read_table_json(local_meta_path)
    arrow_cols = []
    for col in metadata.columns:
        if col["name"] not in metadata.partitions:
            arrow_cols.append(convert_meta_col_to_arrow_tuple(col))

    s = pa.schema(arrow_cols)
    tab = tab.cast(s)

    with s3.open_output_stream(s3_path) as f:
        pq.write_table(tab, f)
def update_video_database(self, start_time, vfile_name, end_time, event):
    if self.vfdb is None:
        self.get_video_database()
    name = "Writer: " + self.config.name
    vfdb = self.vfdb
    deleted = False
    try:
        space_used = os.path.getsize(vfile_name)
        vfdb.loc[start_time] = [
            vfile_name, end_time, event, deleted, space_used
        ]
    except FileNotFoundError:
        logger.warning(f"{name}: File missing, zero frames maybe?")

    logger.debug(
        f"{name} : ===============================================")
    logger.debug(
        f"{name} : start_time: {start_time}, vfile_name : {vfile_name}, end_time: {end_time}, event: {event}, deleted:{deleted}"
    )
    logger.debug(
        f"{name} : ===============================================")
    logger.debug(f"{name} : vfdb entries: {self.vfdb}")
    logger.debug(
        f"{name} : ===============================================")

    parquet_file = os.path.join(self.config.mydir,
                                self.config.name + '_videodb.parquet')
    if os.path.isfile(parquet_file):
        logger.debug(f"{name}:File exists, will overwrite")
    vfdbtable = pa.Table.from_pandas(vfdb)
    localfs = pfs.LocalFileSystem()
    with localfs.open_output_stream(parquet_file) as pfile:
        pq.write_table(vfdbtable, pfile)
    self.vfdb = vfdb
    fs_protocol_obj = util.FSProtocolClass(path)

    result = _read_table(
        fs_protocol_obj, use_legacy_dataset=use_legacy_dataset
    )
    assert result.equals(table)

    # combined with non-local filesystem raises
    with pytest.raises(TypeError):
        _read_table(fs_protocol_obj, filesystem=FileSystem())


@pytest.mark.dataset
@parametrize_legacy_dataset
@pytest.mark.parametrize("filesystem", [
    None, fs.LocalFileSystem(), LocalFileSystem._get_instance()
])
def test_relative_paths(tempdir, use_legacy_dataset, filesystem):
    # reading and writing from relative paths
    table = pa.table({"a": [1, 2, 3]})

    # reading
    pq.write_table(table, str(tempdir / "data.parquet"))
    with util.change_cwd(tempdir):
        result = pq.read_table("data.parquet", filesystem=filesystem,
                               use_legacy_dataset=use_legacy_dataset)
    assert result.equals(table)

    # writing
    with util.change_cwd(tempdir):
        pq.write_table(table, "data2.parquet", filesystem=filesystem)
def prune_video_database(self):
    if self.vfdb is None:
        self.get_video_database()
    name = "Writer: " + self.config.name
    vfdb = self.vfdb
    parquet_file = os.path.join(self.config.mydir,
                                self.config.name + '_videodb.parquet')
    if not os.path.isfile(parquet_file):
        return

    now = datetime.datetime.now()
    total, used, free = shutil.disk_usage(parquet_file)
    percentage_free = int((free * 100) / total)
    logger.warning(
        f"{name}: Disk free = {percentage_free}%, total: {int(total/(1024*1024))} MB, used: {int(used/(1024*1024))} MB, free: {int(free/(1024*1024))} MB"
    )

    # TODO: Get threshold from configuration file
    passnum = 0
    #if percentage_free <= self.config.deletethreshold:
    while percentage_free <= self.config.deletethreshold:
        #prune_older_than_hours = datetime.timedelta(days=self.config.deleteafterdays,
        deleteafterdays = self.config.deleteafterdays - passnum
        prune_older_than_hours = datetime.timedelta(
            days=deleteafterdays,
            hours=now.hour,
            minutes=now.minute,
            seconds=now.second,
            microseconds=now.microsecond)
        prune_from = now - datetime.timedelta(days=1 * 365,
                                              hours=now.hour,
                                              minutes=now.minute,
                                              seconds=now.second,
                                              microseconds=now.microsecond)
        prune_to = now - prune_older_than_hours
        logger.warning(
            f"{name}: Pruning from {prune_from} to {prune_to}, pass:{passnum}"
        )
        passnum = passnum + 1
        #logger.warning(f"Starting vfdb is {vfdb}")
        vfdb_prune = vfdb[prune_from:prune_to]

        # Do not delete files with events if freespace greater than events threshold
        if percentage_free > self.config.deleteeventsthreshold:
            vfdb_prune = vfdb_prune[vfdb_prune['event'] == False].sort_index()

        files_to_prune = vfdb_prune[['vfile_name']]
        total_freed = 0
        for file_tobe_deleted in files_to_prune['vfile_name']:
            try:
                space_freed = os.path.getsize(file_tobe_deleted)
                total_freed += space_freed
                logger.warning(
                    f"{name}: Deleting file: {file_tobe_deleted}, freed: {int(space_freed/(1024*1024))} MB"
                )
                os.remove(file_tobe_deleted)
            except FileNotFoundError:
                logger.debug(
                    f"{name}: File {file_tobe_deleted} not found, might have been manually deleted?"
                )
        logger.warning(
            f"{name}: Total Freed : {int(total_freed/(1024*1024))} MB")

        #vfdb_prune['deleted'].loc[vfdb_prune['deleted'] == False] = True
        vfdb.update(vfdb_prune)
        #logger.warning(f"vfdb: index is {vfdb.loc[vfdb_prune.index]['deleted']}")
        logger.debug(f"{name} : vfdb entries after update: {vfdb}")

        total, used, free = shutil.disk_usage(parquet_file)
        percentage_free = int((free * 100) / total)
        logger.warning(
            f"{name}: After pass: {passnum}, Disk free = {percentage_free}%, total: {int(total/(1024*1024))} MB, used: {int(used/(1024*1024))} MB, free: {int(free/(1024*1024))} MB"
        )

        if os.path.isfile(parquet_file):
            logger.warning(f"{name}:File exists, will overwrite")
        vfdbtable = pa.Table.from_pandas(vfdb)
        localfs = pfs.LocalFileSystem()
        with localfs.open_output_stream(parquet_file) as pfile:
            pq.write_table(vfdbtable, pfile)
        #logger.warning(f"Updated vfdb is {vfdb}")
        self.vfdb = vfdb

        # Don't spin if we cannot even clear up enough after going to 1
        # day retention
        if passnum >= self.config.deleteafterdays:
            break
    except Exception as e:
        assert str(e) == error_text

    buf = out.getvalue()
    result = _read_table(pa.BufferReader(buf),
                         use_legacy_dataset=use_legacy_dataset)

    expected = pd.concat(frames, ignore_index=True)
    tm.assert_frame_equal(result.to_pandas(), expected)


@pytest.mark.pandas
@pytest.mark.parametrize("filesystem", [
    None,
    LocalFileSystem._get_instance(),
    fs.LocalFileSystem(),
])
def test_parquet_writer_filesystem_local(tempdir, filesystem):
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)
    path = str(tempdir / 'data.parquet')

    with pq.ParquetWriter(
        path, table.schema, filesystem=filesystem, version='2.0'
    ) as writer:
        writer.write_table(table)

    result = _read_table(path).to_pandas()
    tm.assert_frame_equal(result, df)
def parse_single_path(path: str):
    if Path(path).exists():
        return fs.LocalFileSystem(), path
    else:
        return fs.FileSystem.from_uri(path)
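A brief usage sketch of the helper above; the paths are illustrative only and assumed to exist or be reachable. An existing local path comes back with a LocalFileSystem unchanged, while anything else is delegated to fs.FileSystem.from_uri, which returns the matching filesystem and the stripped path.

# Hypothetical usage of parse_single_path (example paths are illustrative only).
filesystem, resolved = parse_single_path("/tmp/example.parquet")  # assumes this local file exists
print(type(filesystem).__name__, resolved)  # LocalFileSystem /tmp/example.parquet

# A non-local path such as "s3://bucket/key.parquet" would instead be handed to
# fs.FileSystem.from_uri (requires the corresponding filesystem support to be available).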
table = table.select(["ix", "x", "y", "title", "first_author_name",
                      "date", "language"])

# truncate the title after 101 characters (matching display logic)
truncated_title = pc.utf8_replace_slice(table.column("title"),
                                        start=101, stop=1000, replacement="")
table = table.set_column(table.schema.get_field_index("title"),
                         "title", truncated_title)

# ensure all dictionaries in the file use the same key/value mappings
table = table.unify_dictionaries()

# filter out non-numeric dates (e.g. null, "1850-1853")
# matches the hack in index.js:37
mask = pc.invert(pc.is_null(table.column("date")))
table = table.filter(mask)

# sorting by the date improves the loading aesthetics
# comment this out to exactly match the original appearance
indices = pc.sort_indices(table, sort_keys=[("date", "ascending")])
table = pc.take(table, indices)

# after sorting replace ix with an accurate row index
indices = pc.sort_indices(table, sort_keys=[("date", "ascending")])
table = table.set_column(table.schema.get_field_index("ix"), "ix",
                         pc.cast(indices, pa.uint32()))

temp_path.unlink()

local = fs.LocalFileSystem()
with local.open_output_stream(str(target_path)) as file:
    with pa.RecordBatchStreamWriter(file, table.schema) as writer:
        writer.write_table(table, 10000)
def _parse_data_to_pandas(filepath: str, table_params: dict, metadata: dict):
    """
    Reads in the data from the given filepath and returns a dataframe
    """

    meta_col_names = [
        c["name"] for c in metadata["columns"]
        if c["name"] not in metadata.get("partitions", [])
    ]

    # For string-based file types, make the arrow readers read the columns in
    # as strings. Validators will still treat these as dates but will run
    # validation against strings for cols expecting values to match a timestamp format
    if "json" in metadata["file_format"] or "csv" in metadata["file_format"]:
        md_obj = Metadata.from_dict(metadata)
        cols = md_obj.columns

        cols_to_force_str_read_in = []
        for c in cols:
            if c["type"].startswith("time") or c["type"].startswith("date"):
                c["type"] = "string"
                c["type_category"] = "string"
                cols_to_force_str_read_in.append(c["name"])

        md_obj.columns = cols
        ac = ArrowConverter()
        arrow_schema = ac.generate_from_meta(md_obj)

        ts_as_str_schema = pa.schema([])
        for cname in cols_to_force_str_read_in:
            ts_as_str_schema = ts_as_str_schema.append(
                arrow_schema.field(cname))

    # Set the reader type
    if filepath.startswith("s3://"):
        reader_fs = fs.S3FileSystem(region="eu-west-1")
        fp_for_file_reader = filepath.replace("s3://", "", 1)
    else:
        reader_fs = fs.LocalFileSystem()
        fp_for_file_reader = filepath

    with reader_fs.open_input_stream(fp_for_file_reader) as f:
        if "csv" in metadata["file_format"]:
            # Safer CSV load for newlines_in_values set to True
            if table_params.get("expect-header", True):
                po = csv.ParseOptions(newlines_in_values=True)
            else:
                po = csv.ParseOptions(newlines_in_values=True,
                                      column_names=meta_col_names)

            if ts_as_str_schema:
                co = csv.ConvertOptions(column_types=ts_as_str_schema)
            else:
                co = None

            df = pa_read_csv_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
                convert_options=co,
            )
            # dates/datetimes == string

        elif "json" in metadata["file_format"]:
            po = json.ParseOptions(
                newlines_in_values=True,
                explicit_schema=ts_as_str_schema if ts_as_str_schema else None,
            )
            df = pa_read_json_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
            )
            # dates/datetimes == string

        elif "parquet" in metadata["file_format"]:
            df = arrow_to_pandas(pq.read_table(f))
            # dates/datetimes == datetime / date

        else:
            raise ValueError(
                f"Unknown file_format in metadata: {metadata['file_format']}.")

    if table_params.get("row-limit"):
        df = df.sample(table_params.get("row-limit"))

    if table_params.get("headers-ignore-case"):
        df_cols = [c.lower() for c in df.columns]
        df.columns = df_cols

    if table_params.get("only-test-cols-in-metadata", False):
        keep_cols = [c for c in df.columns if c in meta_col_names]
        df = df[keep_cols]

    return df