Example #1
    def get(self, local_path):
        bucket, path = parsePath(local_path)
        print(bucket, path)
        fs_client = fs.LocalFileSystem()
        file_info_list = fs_client.get_file_info(
            fs.FileSelector(path, recursive=False))

        files = []
        dirs = []
        for info in file_info_list:
            if info.type.value == 2:
                # File type
                files.append({
                    'name': info.base_name,
                    'ext': info.extension,
                    'size': info.size,
                    'mtime': info.mtime.isoformat()
                })
            elif info.type.value == 3:
                # Directory type
                dirs.append({
                    'name': info.base_name,
                    'mtime': info.mtime.isoformat()
                })

        self.finish(json.dumps({'files': files, 'dirs': dirs}))
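
The raw `info.type.value` integer comparisons above work (the snippet treats 2 as File and 3 as Directory), but comparing against the `pyarrow.fs.FileType` enum reads more clearly. A minimal standalone sketch, assuming a plain local path rather than the handler's bucket/path parsing:

from pyarrow import fs

def list_directory(path):
    # Compare FileInfo.type against the FileType enum instead of raw integers.
    local = fs.LocalFileSystem()
    infos = local.get_file_info(fs.FileSelector(path, recursive=False))
    files = [i.base_name for i in infos if i.type == fs.FileType.File]
    dirs = [i.base_name for i in infos if i.type == fs.FileType.Directory]
    return files, dirs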
Example #2
    def update_object_timeseries(self, detect_time, matches, write=False):
        if self.odb is None:
            self.get_object_timeseries()
        odb = self.odb
        name = "Observer: " + self.name

        if matches is not None:
            for match in matches:
                logger.debug(
                    f"Observer: Match class: {match['class']}, score: {match['score']}, roi: {match['roi']}"
                )
                # y1, x1, y2, x2 = boxes[i]
                odb.loc[detect_time] = [
                    match['class'], match['score'], match['roi'][1],
                    match['roi'][0], match['roi'][3], match['roi'][2]
                    # match['roi']
                ]

        if write is True:
            logger.debug(f"{name} : odb is {odb}")
            parquet_file = os.path.join(self.mydir,
                                        self.name + '_events.parquet')
            if os.path.isfile(parquet_file):
                logger.debug(f"{name}:File exists, will overwrite")
            odbtable = pa.Table.from_pandas(odb)
            localfs = pfs.LocalFileSystem()
            with localfs.open_output_stream(parquet_file) as pfile:
                pq.write_table(odbtable, pfile)

        self.odb = odb
Example #3
def test_parquet_writer_filesystem_buffer_raises():
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)
    filesystem = fs.LocalFileSystem()

    # Should raise ValueError when filesystem is passed with file-like object
    with pytest.raises(ValueError, match="specified path is file-like"):
        pq.ParquetWriter(pa.BufferOutputStream(),
                         table.schema,
                         filesystem=filesystem)
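
For comparison, a minimal sketch of the supported pattern the test implies: pass the file-like sink on its own, without the filesystem argument, then read the buffer back.

import pyarrow as pa
import pyarrow.parquet as pq

# Write to an in-memory buffer directly; no filesystem= argument is needed.
table = pa.table({"a": [1, 2, 3]})
sink = pa.BufferOutputStream()
with pq.ParquetWriter(sink, table.schema) as writer:
    writer.write_table(table)
result = pq.read_table(pa.BufferReader(sink.getvalue()))
assert result.equals(table)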
Example #4
def test_open_dataset_filesystem(tempdir):
    # single file
    table, path = _create_single_file(tempdir)

    # filesystem inferred from path
    dataset1 = ds.dataset(str(path))
    assert dataset1.schema.equals(table.schema)

    # filesystem specified
    dataset2 = ds.dataset(str(path), filesystem=fs.LocalFileSystem())
    assert dataset2.schema.equals(table.schema)

    # local filesystem specified with relative path
    with change_cwd(tempdir):
        dataset3 = ds.dataset("test.parquet", filesystem=fs.LocalFileSystem())
    assert dataset3.schema.equals(table.schema)

    # passing different filesystem
    with pytest.raises(FileNotFoundError):
        ds.dataset(str(path), filesystem=fs._MockFileSystem())
Example #5
    def __init__(self, storage_client: Client):

        self.period_per_file = timedelta(minutes=1)

        self.times = []
        self.amps = []
        self.phases = []

        self.client = storage_client
        self.local = fs.LocalFileSystem()
        self.log = logging.getLogger('Buffer')
Example #6
def test_construct_from_single_directory(tempdir):
    directory = tempdir / 'single-directory'
    directory.mkdir()
    tables, paths = _create_directory_of_files(directory)

    d1 = ds.dataset(directory)
    d2 = ds.dataset(directory, filesystem=fs.LocalFileSystem())
    d3 = ds.dataset(directory.name, filesystem=_filesystem_uri(tempdir))
    t1 = d1.to_table()
    t2 = d2.to_table()
    t3 = d3.to_table()
    assert t1 == t2 == t3
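
The `_filesystem_uri(tempdir)` helper comes from the test suite; assuming it builds a local `file://` URI, the third case can be sketched with a plain URI string, which `ds.dataset()` also accepts as its filesystem argument:

# Sketch only: assumes _filesystem_uri(tempdir) stands in for a "file://" URI
# (Windows paths would need the "file:///C:..." form instead).
d3_uri = ds.dataset(directory.name, filesystem="file://" + str(tempdir))
assert d3_uri.to_table() == d1.to_table()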
Example #7
def test_construct_from_single_file(tempdir):
    directory = tempdir / 'single-file'
    directory.mkdir()
    table, path = _create_single_file(directory)
    relative_path = path.relative_to(directory)

    # instantiate from a single file
    d1 = ds.dataset(path)
    # instantiate from a single file with a filesystem object
    d2 = ds.dataset(path, filesystem=fs.LocalFileSystem())
    # instantiate from a single file with prefixed filesystem URI
    d3 = ds.dataset(relative_path, filesystem=_filesystem_uri(directory))
    assert d1.to_table() == d2.to_table() == d3.to_table()
Example #8
def test_open_dataset_filesystem(tempdir):
    # single file
    table, path = _create_single_file(tempdir)

    # filesystem inferred from path
    dataset1 = ds.dataset(str(path))
    assert dataset1.schema.equals(table.schema, check_metadata=False)

    # filesystem specified
    dataset2 = ds.dataset(str(path), filesystem=fs.LocalFileSystem())
    assert dataset2.schema.equals(table.schema, check_metadata=False)

    # passing different filesystem
    with pytest.raises(FileNotFoundError):
        ds.dataset(str(path), filesystem=fs._MockFileSystem())
Example #9
def test_filesystem_uri(tmpdir):
    from pyarrow import orc
    table = pa.table({"a": [1, 2, 3]})

    directory = tmpdir / "data_dir"
    directory.mkdir()
    path = directory / "data.orc"
    orc.write_table(table, str(path))

    # filesystem object
    result = orc.read_table(path, filesystem=fs.LocalFileSystem())
    assert result.equals(table)

    # filesystem URI
    result = orc.read_table("data_dir/data.orc",
                            filesystem=util._filesystem_uri(tmpdir))
    assert result.equals(table)
Example #10
    def __init__(
        self,
        address=None,
        name: str = None,
        namespace: str = None,
        partitions: int = 1,
        storage_type: LocalFSStoreType = LocalFSStoreType.DISK,
        options=None,
    ):
        super(StorageTable, self).__init__(
            name=name,
            namespace=namespace,
            address=address,
            partitions=partitions,
            options=options,
            engine=StorageEngine.LOCALFS,
            store_type=storage_type,
        )
        self._local_fs_client = fs.LocalFileSystem()
Example #11
def test_construct_from_list_of_files(tempdir):
    # instantiate from a list of files
    directory = tempdir / 'list-of-files'
    directory.mkdir()
    tables, paths = _create_directory_of_files(directory)

    relative_paths = [p.relative_to(tempdir) for p in paths]
    with change_cwd(tempdir):
        d1 = ds.dataset(relative_paths)
        t1 = d1.to_table()
        assert len(t1) == sum(map(len, tables))

    d2 = ds.dataset(relative_paths, filesystem=_filesystem_uri(tempdir))
    t2 = d2.to_table()
    d3 = ds.dataset(paths)
    t3 = d3.to_table()
    d4 = ds.dataset(paths, filesystem=fs.LocalFileSystem())
    t4 = d4.to_table()

    assert t1 == t2 == t3 == t4
Example #12
def read_csv_write_to_parquet(local_data_path, s3_path, local_meta_path):

    if s3_path.startswith("s3://"):
        s3_path = s3_path.replace("s3://", "", 1)

    local = fs.LocalFileSystem()
    s3 = fs.S3FileSystem(region=REGION)
    with local.open_input_stream(local_data_path) as f:
        tab = csv.read_csv(f)

    metadata = read_table_json(local_meta_path)
    arrow_cols = []
    for col in metadata.columns:
        if col["name"] not in metadata.partitions:
            arrow_cols.append(convert_meta_col_to_arrow_tuple(col))

    s = pa.schema(arrow_cols)
    tab = tab.cast(s)

    with s3.open_output_stream(s3_path) as f:
        pq.write_table(tab, f)
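
A hypothetical invocation (the paths below are placeholders, not from the original project); note that the function strips the `s3://` scheme itself before opening the output stream:

read_csv_write_to_parquet(
    local_data_path="data/input.csv",              # placeholder local CSV
    s3_path="s3://my-bucket/output/data.parquet",  # scheme is stripped internally
    local_meta_path="meta/input_table.json",       # placeholder metadata file
)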
Example #13
    def update_video_database(self, start_time, vfile_name, end_time, event):
        if self.vfdb is None:
            self.get_video_database()

        name = "Writer: " + self.config.name
        vfdb = self.vfdb
        deleted = False

        try:
            space_used = os.path.getsize(vfile_name)
            vfdb.loc[start_time] = [
                vfile_name, end_time, event, deleted, space_used
            ]
        except FileNotFoundError:
            logger.warning(f"{name}: File missing, zero frames maybe?")

        logger.debug(
            f"{name} : ===============================================")
        logger.debug(
            f"{name} : start_time: {start_time}, vfile_name : {vfile_name}, end_time: {end_time}, event: {event}, deleted:{deleted}"
        )
        logger.debug(
            f"{name} : ===============================================")
        logger.debug(f"{name} : vfdb entries: {self.vfdb}")
        logger.debug(
            f"{name} : ===============================================")

        parquet_file = os.path.join(self.config.mydir,
                                    self.config.name + '_videodb.parquet')
        if os.path.isfile(parquet_file):
            logger.debug(f"{name}:File exists, will overwrite")
        vfdbtable = pa.Table.from_pandas(vfdb)

        localfs = pfs.LocalFileSystem()
        with localfs.open_output_stream(parquet_file) as pfile:
            pq.write_table(vfdbtable, pfile)

        self.vfdb = vfdb
Example #14
    fs_protocol_obj = util.FSProtocolClass(path)

    result = _read_table(
        fs_protocol_obj, use_legacy_dataset=use_legacy_dataset
    )
    assert result.equals(table)

    # combined with non-local filesystem raises
    with pytest.raises(TypeError):
        _read_table(fs_protocol_obj, filesystem=FileSystem())


@pytest.mark.dataset
@parametrize_legacy_dataset
@pytest.mark.parametrize("filesystem", [
    None, fs.LocalFileSystem(), LocalFileSystem._get_instance()
])
def test_relative_paths(tempdir, use_legacy_dataset, filesystem):
    # reading and writing from relative paths
    table = pa.table({"a": [1, 2, 3]})

    # reading
    pq.write_table(table, str(tempdir / "data.parquet"))
    with util.change_cwd(tempdir):
        result = pq.read_table("data.parquet", filesystem=filesystem,
                               use_legacy_dataset=use_legacy_dataset)
    assert result.equals(table)

    # writing
    with util.change_cwd(tempdir):
        pq.write_table(table, "data2.parquet", filesystem=filesystem)
Example #15
    fs_protocol_obj = util.FSProtocolClass(path)

    result = _read_table(fs_protocol_obj,
                         use_legacy_dataset=use_legacy_dataset)
    assert result.equals(table)

    # combined with non-local filesystem raises
    with pytest.raises(TypeError):
        _read_table(fs_protocol_obj, filesystem=FileSystem())


@pytest.mark.dataset
@parametrize_legacy_dataset
@pytest.mark.parametrize(
    "filesystem",
    [None, fs.LocalFileSystem(),
     LocalFileSystem._get_instance()])
def test_relative_paths(tempdir, use_legacy_dataset, filesystem):
    # reading and writing from relative paths
    table = pa.table({"a": [1, 2, 3]})

    # reading
    pq.write_table(table, str(tempdir / "data.parquet"))
    with util.change_cwd(tempdir):
        result = pq.read_table("data.parquet",
                               filesystem=filesystem,
                               use_legacy_dataset=use_legacy_dataset)
    assert result.equals(table)

    # writing
    with util.change_cwd(tempdir):
        pq.write_table(table, "data2.parquet", filesystem=filesystem)
Example #16
    def prune_video_database(self):
        if self.vfdb is None:
            self.get_video_database()
        name = "Writer: " + self.config.name

        vfdb = self.vfdb

        parquet_file = os.path.join(self.config.mydir,
                                    self.config.name + '_videodb.parquet')
        if not os.path.isfile(parquet_file):
            return

        now = datetime.datetime.now()

        total, used, free = shutil.disk_usage(parquet_file)
        percentage_free = int((free * 100) / total)
        logger.warning(
            f"{name}: Disk free = {percentage_free}%, total: {int(total/(1024*1024))} MB, used: {int(used/(1024*1024))} MB, free: {int(free/(1024*1024))} MB"
        )

        # TODO: Get threshold from configuration file
        passnum = 0
        #if percentage_free <= self.config.deletethreshold:
        while percentage_free <= self.config.deletethreshold:
            #prune_older_than_hours = datetime.timedelta(days=self.config.deleteafterdays,
            deleteafterdays = self.config.deleteafterdays - passnum
            prune_older_than_hours = datetime.timedelta(
                days=deleteafterdays,
                hours=now.hour,
                minutes=now.minute,
                seconds=now.second,
                microseconds=now.microsecond)
            prune_from = now - datetime.timedelta(days=1 * 365,
                                                  hours=now.hour,
                                                  minutes=now.minute,
                                                  seconds=now.second,
                                                  microseconds=now.microsecond)
            prune_to = now - prune_older_than_hours
            logger.warning(
                f"{name}: Pruning from {prune_from} to {prune_to}, pass:{passnum}"
            )

            passnum = passnum + 1

            #logger.warning(f"Starting vfdb is {vfdb}")
            vfdb_prune = vfdb[prune_from:prune_to]
            # Do not delete files with events if freespace greater than events threshold
            if percentage_free > self.config.deleteeventsthreshold:
                vfdb_prune = vfdb_prune[vfdb_prune['event'] ==
                                        False].sort_index()
            files_to_prune = vfdb_prune[['vfile_name']]

            total_freed = 0
            for file_tobe_deleted in files_to_prune['vfile_name']:
                try:
                    space_freed = os.path.getsize(file_tobe_deleted)
                    total_freed += space_freed
                    logger.warning(
                        f"{name}: Deleting file: {file_tobe_deleted}, freed: {int(space_freed/(1024*1024))} MB"
                    )
                    os.remove(file_tobe_deleted)
                except FileNotFoundError:
                    logger.debug(
                        f"{name}: File {file_tobe_deleted} not found, might have been manually deleted?"
                    )

            logger.warning(
                f"{name}: Total Freed : {int(total_freed/(1024*1024))} MB")

            #vfdb_prune['deleted'].loc[vfdb_prune['deleted'] == False] = True
            vfdb.update(vfdb_prune)
            #logger.warning(f"vfdb: index is {vfdb.loc[vfdb_prune.index]['deleted']}")
            logger.debug(f"{name} : vfdb entries after update: {vfdb}")

            total, used, free = shutil.disk_usage(parquet_file)
            percentage_free = int((free * 100) / total)
            logger.warning(
                f"{name}: After pass: {passnum}, Disk free = {percentage_free}%, total: {int(total/(1024*1024))} MB, used: {int(used/(1024*1024))} MB, free: {int(free/(1024*1024))} MB"
            )

            if os.path.isfile(parquet_file):
                logger.warning(f"{name}:File exists, will overwrite")

            vfdbtable = pa.Table.from_pandas(vfdb)
            localfs = pfs.LocalFileSystem()
            with localfs.open_output_stream(parquet_file) as pfile:
                pq.write_table(vfdbtable, pfile)
            #logger.warning(f"Updated vfdb is {vfdb}")

            self.vfdb = vfdb

            # Don't spin if we cannot even clear up enough after going to 1
            # day retention
            if passnum >= self.config.deleteafterdays:
                break
Example #17
    except Exception as e:
        assert str(e) == error_text

    buf = out.getvalue()
    result = _read_table(pa.BufferReader(buf),
                         use_legacy_dataset=use_legacy_dataset)

    expected = pd.concat(frames, ignore_index=True)
    tm.assert_frame_equal(result.to_pandas(), expected)


@pytest.mark.pandas
@pytest.mark.parametrize("filesystem", [
    None,
    LocalFileSystem._get_instance(),
    fs.LocalFileSystem(),
])
def test_parquet_writer_filesystem_local(tempdir, filesystem):
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)
    path = str(tempdir / 'data.parquet')

    with pq.ParquetWriter(path,
                          table.schema,
                          filesystem=filesystem,
                          version='2.0') as writer:
        writer.write_table(table)

    result = _read_table(path).to_pandas()
    tm.assert_frame_equal(result, df)
Example #18
def parse_single_path(path: str):
    if Path(path).exists():
        return fs.LocalFileSystem(), path
    else:
        return fs.FileSystem.from_uri(path)
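
Both branches return a (filesystem, path) pair, since `fs.FileSystem.from_uri()` also yields a `(FileSystem, path)` tuple; a hypothetical usage sketch (the path is a placeholder):

# Unpack the pair the same way regardless of which branch was taken.
filesystem, resolved_path = parse_single_path("/tmp/data/file.parquet")
with filesystem.open_input_stream(resolved_path) as f:
    payload = f.read()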
Example #19
table = table.select(["ix", "x", "y", "title", "first_author_name", "date", "language"])

# truncate the title after 101 characters (matching display logic)
truncated_title = pc.utf8_replace_slice(table.column("title"), start=101, stop=1000, replacement="")
table = table.set_column(table.schema.get_field_index("title"), "title", truncated_title)

# ensure all dictionaries in the file use the same key/value mappings
table = table.unify_dictionaries()

# filter out non-numeric dates (e.g. null, "1850-1853")
# matches the hack in index.js:37
mask = pc.invert(pc.is_null(table.column("date")))
table = table.filter(mask)

# sorting by the date improves the loading aesthetics
# comment this out to exactly match the original appearance
indices = pc.sort_indices(table, sort_keys=[("date", "ascending")])
table = pc.take(table, indices)

# after sorting replace ix with an accurate row index
indices = pc.sort_indices(table, sort_keys=[("date", "ascending")])
table = table.set_column(table.schema.get_field_index("ix"), "ix", pc.cast(indices, pa.uint32()))

temp_path.unlink()

local = fs.LocalFileSystem()

with local.open_output_stream(str(target_path)) as file:
    with pa.RecordBatchStreamWriter(file, table.schema) as writer:
        writer.write_table(table, 10000)
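
A minimal read-back sketch for the stream written above, assuming the same `local` filesystem and `target_path` from the surrounding script:

# Re-open the written Arrow IPC stream and materialize it as a table.
with local.open_input_stream(str(target_path)) as file:
    reader = pa.ipc.open_stream(file)
    round_tripped = reader.read_all()
assert round_tripped.num_rows == table.num_rows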
Example #20
def _parse_data_to_pandas(filepath: str, table_params: dict, metadata: dict):
    """
    Reads in the data from the given filepath and returns
    a dataframe
    """

    meta_col_names = [
        c["name"] for c in metadata["columns"]
        if c["name"] not in metadata.get("partitions", [])
    ]

    # For string-based file types, make the arrow readers read these columns in
    # as strings; validators will still treat them as dates, but will run
    # validation against strings for cols expecting values to match a timestamp
    # format
    if "json" in metadata["file_format"] or "csv" in metadata["file_format"]:
        md_obj = Metadata.from_dict(metadata)
        cols = md_obj.columns

        cols_to_force_str_read_in = []
        for c in cols:
            if c["type"].startswith("time") or c["type"].startswith("date"):
                c["type"] = "string"
                c["type_category"] = "string"
                cols_to_force_str_read_in.append(c["name"])

        md_obj.columns = cols
        ac = ArrowConverter()
        arrow_schema = ac.generate_from_meta(md_obj)

        ts_as_str_schema = pa.schema([])
        for cname in cols_to_force_str_read_in:
            ts_as_str_schema = ts_as_str_schema.append(
                arrow_schema.field(cname))

    # Set the reader type
    if filepath.startswith("s3://"):
        reader_fs = fs.S3FileSystem(region="eu-west-1")
        fp_for_file_reader = filepath.replace("s3://", "", 1)

    else:
        reader_fs = fs.LocalFileSystem()
        fp_for_file_reader = filepath

    with reader_fs.open_input_stream(fp_for_file_reader) as f:
        if "csv" in metadata["file_format"]:

            # Safer CSV load for newlines_in_values set to True
            if table_params.get("expect-header", True):
                po = csv.ParseOptions(newlines_in_values=True)
            else:
                po = csv.ParseOptions(newlines_in_values=True,
                                      column_names=meta_col_names)

            if ts_as_str_schema:
                co = csv.ConvertOptions(column_types=ts_as_str_schema)
            else:
                co = None

            df = pa_read_csv_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
                convert_options=co,
            )
            # dates/datetimes == string

        elif "json" in metadata["file_format"]:

            po = json.ParseOptions(
                newlines_in_values=True,
                explicit_schema=ts_as_str_schema if ts_as_str_schema else None,
            )

            df = pa_read_json_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
            )
            # dates/datetimes == string

        elif "parquet" in metadata["file_format"]:
            df = arrow_to_pandas(pq.read_table(f))
            # dates/datetimes == datetime / date

        else:
            raise ValueError(
                f"Unknown file_format in metadata: {metadata['file_format']}.")

    if table_params.get("row-limit"):
        df = df.sample(table_params.get("row-limit"))

    if table_params.get("headers-ignore-case"):
        df_cols = [c.lower() for c in df.columns]
        df.columns = df_cols

    if table_params.get("only-test-cols-in-metadata", False):
        keep_cols = [c for c in df.columns if c in meta_col_names]
        df = df[keep_cols]

    return df
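
The readers used above (`pa_read_csv_to_pandas`, `pa_read_json_to_pandas`, `ArrowConverter`) come from the surrounding project; a plain-pyarrow sketch of the same "read timestamp columns as strings" idea, with hypothetical file and column names:

import pyarrow as pa
from pyarrow import csv, fs

local = fs.LocalFileSystem()
# Force a date-like column to be read as string so it can be validated as text.
convert_options = csv.ConvertOptions(column_types={"created_at": pa.string()})
parse_options = csv.ParseOptions(newlines_in_values=True)
with local.open_input_stream("data/input.csv") as f:
    table = csv.read_csv(f, parse_options=parse_options,
                         convert_options=convert_options)
df = table.to_pandas()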