Пример #1
0
    def upload_file(self, filename):
        """Register a copy of a local file as a new file store entry.

        The source file is left in place. Its content is copied to the data
        file inside the directory obtained for a freshly generated identifier,
        and a metadata file is written for the new entry.

        Parameters
        ----------
        filename: string
            Path to file on disk

        Returns
        -------
        vizier.filestore.base.FileHandle

        Raises
        ------
        ValueError
            If the given path does not point to an existing file.
        """
        # Refuse paths that do not point to an existing file.
        if not os.path.isfile(filename):
            raise ValueError('invalid file path \'' + str(filename) + '\'')
        # Obtain a fresh identifier and the (newly created) directory for it.
        file_id = get_unique_identifier()
        target_dir = self.get_file_dir(file_id, create=True)
        target_path = os.path.join(target_dir, DATA_FILENAME)
        # Copy the content; the handle keeps the original base name.
        shutil.copyfile(filename, target_path)
        handle = FileHandle(
            file_id,
            filepath=target_path,
            file_name=os.path.basename(filename)
        )
        # Persist the handle's metadata alongside the copied data file.
        write_metadata_file(target_dir, handle)
        return handle
Пример #2
0
    def upload_stream(self, file, file_name):
        """Store the content of a file stream as a new file store entry.

        The stream is saved to the data file inside the directory obtained
        for a freshly generated identifier, and a metadata file is written
        for the new entry.

        Parameters
        ----------
        file: werkzeug.datastructures.FileStorage
            File object (e.g., uploaded via HTTP request)
        file_name: string
            Name of the file

        Returns
        -------
        vizier.filestore.base.FileHandle
        """
        # Obtain a fresh identifier and the (newly created) directory for it.
        file_id = get_unique_identifier()
        target_dir = self.get_file_dir(file_id, create=True)
        target_path = os.path.join(target_dir, DATA_FILENAME)
        # The stream object writes itself to the target location.
        file.save(target_path)
        handle = FileHandle(file_id, filepath=target_path, file_name=file_name)
        # Persist the handle's metadata alongside the stored data file.
        write_metadata_file(target_dir, handle)
        return handle
Пример #3
0
    def download_file(self, url, username=None, password=None):
        """Create a local copy of the identified web resource.

        The resource is fetched with ``urllib.request.urlopen`` and its raw
        bytes are written to the data file of a new file store entry. The
        file name recorded in the handle is derived from the URL and the
        response headers via ``get_download_filename``.

        Parameters
        ----------
        url : string
            Unique resource identifier for external resource that is accessed
        username: string, optional
            Optional user name for authentication. NOTE: accepted for
            interface compatibility but not used by this implementation.
        password: string, optional
            Optional password for authentication. NOTE: accepted for
            interface compatibility but not used by this implementation.

        Returns
        -------
        vizier.filestore.base.FileHandle
        """
        # Get unique identifier and output file
        identifier = get_unique_identifier()
        file_dir = self.get_file_dir(identifier, create=True)
        output_file = os.path.join(file_dir, DATA_FILENAME)
        # Write web resource to output file. The response body is always
        # bytes, so the output file has to be opened in binary mode no matter
        # what the file extension is; writing bytes to a text-mode file raises
        # a TypeError. The context manager closes the connection afterwards.
        with urllib.request.urlopen(url) as response:
            filename = get_download_filename(url, response.info())
            with open(output_file, 'wb') as f:
                f.write(response.read())
        # Add file to file index
        f_handle = FileHandle(identifier,
                              filepath=output_file,
                              file_name=filename)
        # Write metadata file
        write_metadata_file(file_dir, f_handle)
        return f_handle
Пример #4
0
    def download_dataset(self,
                         url,
                         username=None,
                         password=None,
                         filestore=None):
        """Create a new dataset from a downloaded web resource.

        If a filestore is given, the resource is downloaded through it and
        the resulting file handle is returned together with the dataset.
        Otherwise the resource is downloaded into a temporary directory that
        is removed again before this method returns, and only the dataset is
        returned.

        Raises ValueError if the given file could not be loaded as a dataset.

        Parameters
        ----------
        url : string
            Unique resource identifier for external resource that is accessed
        username: string, optional
            Optional user name for authentication (only forwarded to the
            filestore; not used for the temporary download)
        password: string, optional
            Optional password for authentication (only forwarded to the
            filestore; not used for the temporary download)
        filestore: vizier.filestore.base.Filestore, optional
            Optional filestore to save a local copy of the downloaded resource

        Returns
        -------
        vizier.datastore.fs.dataset.FileSystemDatasetHandle
            If no filestore was given.
        (vizier.datastore.fs.dataset.FileSystemDatasetHandle,
        vizier.filestore.base.FileHandle)
            If a filestore was given.
        """
        if filestore is not None:
            # Upload the file to the filestore to get the file handle
            fh = filestore.download_file(url=url,
                                         username=username,
                                         password=password)
            # Since the filestore was given we return a tuple of dataset
            # descriptor and file handle
            return self.load_dataset(fh), fh
        # Manually download the file temporarily
        temp_dir = tempfile.mkdtemp()
        try:
            # The response body is bytes, so the file must be written in
            # binary mode regardless of its extension. The context manager
            # closes the connection once the body has been read.
            with urllib.request.urlopen(url) as response:
                filename = get_download_filename(url, response.info())
                download_file = os.path.join(temp_dir, filename)
                with open(download_file, 'wb') as f:
                    f.write(response.read())
            fh = FileHandle(identifier=filename,
                            filepath=download_file,
                            file_name=filename)
            # Return only the dataset descriptor
            return self.load_dataset(fh)
        finally:
            # Remove the temporary download on success and on failure alike;
            # any exception propagates unchanged.
            if os.path.isdir(temp_dir):
                shutil.rmtree(temp_dir)
Пример #5
0
    def load_dataset(
        self,
        f_handle: FileHandle,
        proposed_schema: List[Tuple[str,
                                    str]] = []) -> FileSystemDatasetHandle:
        """Create a new dataset from a given file.

        The file is read as delimited text using the delimiter reported by
        the file handle. The first row supplies the column names; every
        following row becomes one dataset row. Rows are written to a data
        file and the dataset descriptor to a descriptor file, both inside a
        newly created dataset directory.

        Raises ValueError if the given file could not be loaded as a dataset.

        Parameters
        ----------
        f_handle : vizier.filestore.base.FileHandle
            Handle for an uploaded file
        proposed_schema : list(tuple(str, str)), optional
            Not used by this implementation (and never modified, so the
            shared default list is harmless).

        Returns
        -------
        vizier.datastore.fs.dataset.FileSystemDatasetHandle

        Raises
        ------
        ValueError
            If the handle is None, the file is not in a tabular format, or
            the file is empty (has no header row).
        """
        # The file handle might be None in which case an exception is raised
        if f_handle is None:
            raise ValueError('unknown file')
        # Expects a file in a supported tabular data format.
        if not f_handle.is_tabular:
            raise ValueError('cannot create dataset from file \'' +
                             f_handle.name + '\'')
        # Open the file as a csv file. Expects that the first row contains the
        # column names. Read dataset schema and dataset rows into two separate
        # lists.
        columns: List[DatasetColumn] = []
        rows: List[DatasetRow] = []
        with f_handle.open() as csvfile:
            reader = csv.reader(csvfile, delimiter=f_handle.delimiter)
            # An empty file has no header row. Report that as the documented
            # ValueError instead of letting StopIteration escape.
            header = next(reader, None)
            if header is None:
                raise ValueError('cannot create dataset from empty file \'' +
                                 f_handle.name + '\'')
            # Column identifiers are the zero-based column positions.
            columns = [
                DatasetColumn(identifier=col_idx, name=col_name.strip())
                for col_idx, col_name in enumerate(header)
            ]
            # Row identifiers are the zero-based row positions as strings.
            # Each stripped cell value is passed through cast().
            rows = [
                DatasetRow(identifier=str(row_idx),
                           values=[cast(v.strip()) for v in row])
                for row_idx, row in enumerate(reader)
            ]
        # Get unique identifier and create subfolder for the new dataset
        identifier = get_unique_identifier()
        dataset_dir = self.get_dataset_dir(identifier)
        os.makedirs(dataset_dir)
        # Write rows to data file
        data_file = os.path.join(dataset_dir, DATA_FILE)
        DefaultJsonDatasetReader(data_file).write(rows)
        # Create dataset and write descriptor to file
        dataset = FileSystemDatasetHandle(identifier=identifier,
                                          columns=columns,
                                          data_file=data_file,
                                          row_count=len(rows),
                                          max_row_id=len(rows) - 1)
        dataset.to_file(
            descriptor_file=os.path.join(dataset_dir, DESCRIPTOR_FILE))
        return dataset
Пример #6
0
    def unload_dataset(self,
                       filepath,
                       dataset_name,
                       format='csv',
                       options=[],
                       filename=""):
        """Export a dataset with a given name.

        The export itself is delegated to ``mimir.unloadDataSource``. Each
        exported file is then moved into its own directory, named after a
        freshly generated identifier and created next to ``filepath`` (i.e.
        in its parent directory), and a metadata file is written for it.

        Raises ValueError if the given dataset could not be exported.

        Parameters
        ----------
        filepath: string
            Export target handed to the export call. Exported file names are
            resolved relative to this path.
        dataset_name: string
            Name of the dataset to unload
        format: string
            Format for output (csv, json, etc.)
        options: list
            Options for data unload (passed through unchanged)
        filename: string
            The output filename - may be empty if outputting to a database.
            Not used by this implementation.

        Returns
        -------
        list(vizier.filestore.base.FileHandle)
        """
        # Directory that contains the export target. Use dirname() instead of
        # string-replacing the lower-cased base name: the replacement fails
        # for mixed-case names and corrupts the path when the base name also
        # occurs in a parent directory.
        basepath = os.path.dirname(filepath)

        abspath = os.path.abspath(filepath)
        exported_files = mimir.unloadDataSource(dataset_name, abspath, format,
                                                options)
        file_handles = []
        for output_file in exported_files:
            name = os.path.basename(output_file).lower()
            # Create a new unique identifier for the file.
            identifier = get_unique_identifier()
            file_dir = os.path.join(basepath, identifier)
            if not os.path.isdir(file_dir):
                os.makedirs(file_dir)
            fs_output_file = os.path.join(file_dir, DATA_FILENAME)
            shutil.move(os.path.join(filepath, output_file), fs_output_file)
            # NOTE(review): the handle keeps the pre-move path (output_file)
            # rather than fs_output_file; confirm whether callers rely on
            # this before changing it.
            f_handle = FileHandle(identifier, output_file, name)
            file_handles.append(f_handle)
            write_metadata_file(file_dir, f_handle)
        return file_handles
Пример #7
0
    def list_files(self):
        """Collect a file handle for every sub-directory of the base path.

        Entries of the base path that are not directories are ignored. For
        each directory the metadata file is read and the directory name is
        used as the identifier of the resulting handle.

        Returns
        -------
        list(vizier.filestore.base.FileHandle)
        """
        handles = []
        for entry in os.listdir(self.base_path):
            entry_dir = os.path.join(self.base_path, entry)
            # Only directories are turned into file handles.
            if not os.path.isdir(entry_dir):
                continue
            file_name, mimetype, encoding = read_metadata_file(entry_dir)
            data_path = os.path.join(entry_dir, DATA_FILENAME)
            handles.append(
                FileHandle(entry,
                           filepath=data_path,
                           file_name=file_name,
                           mimetype=mimetype,
                           encoding=encoding))
        return handles
Пример #8
0
    def get_file(self, identifier):
        """Look up the file handle for a given identifier.

        Parameters
        ----------
        identifier: string
            Unique file identifier

        Returns
        -------
        vizier.filestore.base.FileHandle
            None if the directory for the identifier does not exist.
        """
        entry_dir = self.get_file_dir(identifier)
        # No directory for the identifier means the file is unknown.
        if not os.path.isdir(entry_dir):
            return None
        file_name, mimetype, encoding = read_metadata_file(entry_dir)
        data_path = os.path.join(entry_dir, DATA_FILENAME)
        return FileHandle(identifier,
                          filepath=data_path,
                          file_name=file_name,
                          mimetype=mimetype,
                          encoding=encoding)
from vizier.datastore.base import METADATA_FILE
from vizier.datastore.dataset import DatasetColumn, DatasetRow
from vizier.datastore.fs.base import FileSystemDatastore
from vizier.datastore.fs.base import DATA_FILE, DESCRIPTOR_FILE
from vizier.datastore.fs.base import validate_dataset
from vizier.filestore.fs.base import FileSystemFilestore
from vizier.filestore.base import FileHandle, FORMAT_TSV

# Directories used by this test module, relative to the working directory.
# STORE_DIR and FSSTORE_DIR are both located below BASE_DIR (presumably the
# datastore and filestore roots, going by their names).
BASE_DIR = './.tmp'
STORE_DIR = './.tmp/ds'
FSSTORE_DIR = './.tmp/fs'

# Handle with a fixed identifier that points at a CSV file in the test data
# directory.
FILE = FileHandle(
    identifier='0000',
    filepath='./tests/test_data/r.csv',
    file_name='r.csv'
)

# Note that some tests access an external resource to test the download
# capabilities. Those tests fail if the specified resource is not available.
# Set DOWNLOAD_URL to an available resource, or to None to skip the download
# tests.
DOWNLOAD_URL = 'https://github.com/UBOdin/mimir-api/raw/master/test_data/r.csv'


EXAMPLE_PROPERTIES = {
  'columns': [
      { 'name': 'A',
        'structural_type': 'http://schema.org/Integer',
        'semantic_types': [],
        'unclean_values_ratio': 0.0,
from vizier.datastore.annotation.dataset import DatasetMetadata
from vizier.datastore.base import METADATA_FILE
from vizier.datastore.dataset import DatasetColumn, DatasetRow
from vizier.datastore.fs.base import FileSystemDatastore
from vizier.datastore.fs.base import DATA_FILE, DESCRIPTOR_FILE
from vizier.datastore.fs.base import validate_dataset
from vizier.filestore.fs.base import FileSystemFilestore
from vizier.filestore.base import FileHandle, FORMAT_TSV

# Directories used by this test module, relative to the working directory.
# STORE_DIR and FSSTORE_DIR are both located below BASE_DIR (presumably the
# datastore and filestore roots, going by their names).
BASE_DIR = './.tmp'
STORE_DIR = './.tmp/ds'
FSSTORE_DIR = './.tmp/fs'

# Handle with a fixed identifier that points at a local TSV file.
FILE = FileHandle(
    identifier='0000',
    filepath='./.files/w49k-mmkh.tsv',
    file_name='w49k-mmkh.tsv'
)

# Note that some tests access an external resource to test the download
# capabilities. Those tests fail if the specified resource is not available.
# Set DOWNLOAD_URL to an available resource, or to None to skip the download
# tests.
DOWNLOAD_URL = 'http://cds-swg1.cims.nyu.edu:8080/opendb-api/api/v1/datasets/w49k-mmkh/rows/download'

class TestFileSystemDatastore(unittest.TestCase):

    def setUp(self):
        """Create an empty datastore directory."""
        # Delete datastore directory if it exists
        if os.path.isdir(BASE_DIR):
            shutil.rmtree(BASE_DIR)