Пример #1
0
    def next(self):
        """Return the next row in the dataset iterator.

        Raises StopIteration if the end of the underlying reader is reached or
        if the reader is not open. The reader is closed via self.close() the
        first time the underlying reader signals the end of its input.

        If self.has_row_ids is set, the first value of every record is
        converted to an integer and used as the row identifier. Otherwise,
        the running line count is used as the row identifier.

        Returns
        -------
        vizier.datastore.base.DatasetRow
        """
        if not self.is_open:
            raise StopIteration
        try:
            # Use the built-in next() instead of the Python 2-only .next()
            # method so that the call works with Python 2 and 3 iterators.
            values = next(self.reader)
        except StopIteration:
            # End of input: release the underlying resource before the
            # exception is passed on to the caller.
            self.close()
            raise
        if self.has_row_ids:
            row = DatasetRow(int(values[0]), values[1:])
        else:
            row = DatasetRow(self.line_count, values)
        self.line_count += 1
        return row
Пример #2
0
 def test_default_json_reader(self):
     """Test functionality of Json dataset reader.

     Reads the rows of an existing Json file, then writes a new dataset to
     a temporary file and reads it back. Also verifies that next() raises
     StopIteration before the reader is opened and after the with block
     has been left.
     """
     reader = DefaultJsonDatasetReader(JSON_FILE)
     # The reader has not been opened yet
     with self.assertRaises(StopIteration):
         reader.next()
     count = 0
     with reader.open() as r:
         for row in r:
             self.assertEqual(len(row.values), 3)
             self.assertEqual(row.identifier, count)
             count += 1
     self.assertEqual(count, 2)
     # The with block has been left
     with self.assertRaises(StopIteration):
         reader.next()
     # Create a new dataset and read it. mkstemp() returns an open OS-level
     # file descriptor together with the path. Only the path is needed, so
     # the descriptor is closed right away to avoid leaking it.
     fd, tmp_file = tempfile.mkstemp()
     os.close(fd)
     try:
         reader = DefaultJsonDatasetReader(tmp_file)
         values = ['A', 'B', 1, 2]
         rows = [
             DatasetRow(0, values),
             DatasetRow(1, values),
             DatasetRow(2, values)
         ]
         reader.write(rows)
         count = 0
         with reader.open() as r:
             for row in r:
                 self.assertEqual(len(row.values), 4)
                 self.assertEqual(row.identifier, count)
                 count += 1
         self.assertEqual(count, len(rows))
     finally:
         # Remove the temporary file even if one of the assertions failed
         os.remove(tmp_file)
Пример #3
0
    def next(self):
        """Return the next row from the list of rows held by the reader.

        Raises StopIteration if the reader is not open or if all rows have
        been returned. If annotations are present, every cell of the returned
        row that has a cell annotation for its column is flagged in the row's
        cell_annotations list.

        Returns
        -------
        vizier.datastore.base.DatasetRow
        """
        # Nothing to return for a closed or exhausted reader
        if not self.is_open or self.read_index >= len(self.rows):
            raise StopIteration
        result = DatasetRow.from_dict(self.rows[self.read_index])
        if self.annotations is not None:
            # Flag each cell that has an annotation for its column
            for pos, column in enumerate(self.columns):
                annotated = self.annotations.has_cell_annotation(
                    column.identifier,
                    result.identifier
                )
                if annotated:
                    result.cell_annotations[pos] = True
        self.read_index += 1
        return result
Пример #4
0
    def from_file(f_handle):
        """Read a dataset from a delimited text (CSV/TSV) file.

        The first row of the file is expected to contain the column names.
        Every following row is converted into a dataset row. Column and row
        identifiers are assigned sequentially, starting at 0, in the order in
        which columns and rows appear in the file.

        Raises ValueError if the file handle is not flagged as a verified CSV
        file or if the file does not contain a header row.

        Parameters
        ----------
        f_handle : vizier.filestore.base.FileHandle
            Handle for an uploaded file on a file server

        Returns
        -------
        vizier.datastore.base.Dataset
        """
        if not f_handle.is_verified_csv:
            raise ValueError('failed to create dataset from file \'' + f_handle.name + '\'')
        with f_handle.open() as csvfile:
            reader = csv.reader(csvfile, delimiter=f_handle.delimiter)
            try:
                # Built-in next() works with Python 2 and 3 csv readers
                header = next(reader)
            except StopIteration:
                # An empty file has no header row. Report it like any other
                # unreadable file instead of leaking StopIteration.
                raise ValueError('failed to create dataset from file \'' + f_handle.name + '\'')
            columns = [
                DatasetColumn(col_id, col_name.strip())
                    for col_id, col_name in enumerate(header)
            ]
            rows = []
            for row in reader:
                # cast() is a helper defined outside this block; presumably
                # it converts numeric strings to numbers -- TODO confirm
                values = [cast(v.strip()) for v in row]
                rows.append(DatasetRow(len(rows), values))
        # All information is kept in memory. The counters are set to the
        # number of columns and rows that were read.
        return InMemDatasetHandle(
            identifier=get_unique_identifier(),
            columns=columns,
            rows=rows,
            column_counter=len(columns),
            row_counter=len(rows)
        )
Пример #5
0
    def get_dataset(self, identifier):
        """Get a copy of the dataset that is stored under the given
        identifier. The result is None if the identifier is unknown.

        The returned handle contains newly created column and row objects
        (including a new list of values for every row) together with the
        result of copy_metadata() on the stored dataset's annotations.

        Parameters
        ----------
        identifier : string
            Unique dataset identifier

        Returns
        -------
        vizier.datastore.base.DatasetHandle
        """
        # Unknown identifier
        if identifier not in self.datasets:
            return None
        source = self.datasets[identifier]
        # Create new column and row objects for the returned handle
        column_copies = [
            DatasetColumn(col.identifier, col.name) for col in source.columns
        ]
        row_copies = [
            DatasetRow(row.identifier, list(row.values))
                for row in source.fetch_rows()
        ]
        return InMemDatasetHandle(
            identifier=identifier,
            columns=column_copies,
            rows=row_copies,
            column_counter=source.column_counter,
            row_counter=source.row_counter,
            annotations=source.annotations.copy_metadata()
        )
Пример #6
0
    def filter_columns(self, identifier, columns, names):
        """Project a dataset onto a list of columns and store the result as a
        new dataset.

        The result contains the columns listed in columns, in the given order.
        The entry in names at the same position as a column identifier is the
        new name for that column. An entry of None keeps the current column
        name.

        Raises ValueError if no dataset with given identifier exists. Errors
        raised while resolving a column identifier are passed on to the
        caller.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        columns: list(int)
            List of column identifier for columns in the result.
        names: list(string)
            New names for filtered columns (None to keep the current name).

        Returns
        -------
        int, string
            Number of rows in the projected dataset and identifier of the
            projected dataset
        """
        source = self.datastore.get_dataset(identifier)
        if source is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        # Collect the projected schema together with the index position of
        # each selected column in the source rows
        projected_schema = []
        positions = []
        for pos, column_id in enumerate(columns):
            source_idx = get_index_for_column(source, column_id)
            source_col = source.columns[source_idx]
            new_name = names[pos]
            if new_name is None:
                projected_schema.append(source_col)
            else:
                projected_schema.append(
                    DatasetColumn(
                        identifier=source_col.identifier,
                        name=new_name
                    )
                )
            positions.append(source_idx)
        # Keep only the selected values of every row, in the requested order
        projected_rows = [
            DatasetRow(
                identifier=row.identifier,
                values=[row.values[idx] for idx in positions]
            )
            for row in source.fetch_rows()
        ]
        # Storing the projected dataset yields the new identifier
        result = self.datastore.create_dataset(
            columns=projected_schema,
            rows=projected_rows,
            column_counter=source.column_counter,
            row_counter=source.row_counter,
            annotations=source.annotations.filter_columns(columns))
        return len(projected_rows), result.identifier
Пример #7
0
    def open(self):
        """Setup the reader by querying the database and creating an in-memory
        copy of the dataset rows.

        Has no effect if the reader is already open. Otherwise, a select query
        for self.table_name is executed. The query is restricted to a single
        row if self.rowid is set, and to a LIMIT/OFFSET window if
        self.is_range_query is set. The result is converted into a sorted
        list of dataset rows and the read index is reset to 0.

        Returns
        -------
        vizier.datastore.reader.MimirDatasetReader
        """
        if self.is_open:
            return self
        # Build the query for the dataset rows
        sql = get_select_query(self.table_name, columns=self.columns)
        if self.rowid is not None:
            sql += ' WHERE ROWID() = ' + str(self.rowid)
        if self.is_range_query:
            sql += ' LIMIT ' + str(self.limit) + ' OFFSET ' + str(
                self.offset)
        rs = json.loads(
            mimir._mimir.vistrailsQueryMimirJson(sql, True, False))
        self.row_ids = rs['prov']
        # Mapping of column names in the query result to index positions in
        # the result rows
        self.col_map = {
            col['name']: index for index, col in enumerate(rs['schema'])
        }
        # Position of the row identifier in each result row. Looked up once
        # here since it is the same for all rows.
        rowid_idx = self.col_map[ROW_ID]
        self.rows = list()
        for row_index, row in enumerate(rs['data']):
            row_id = str(row[rowid_idx])
            values = list()
            row_annos = list()
            for col in self.columns:
                col_index = self.col_map[col.name_in_rdb]
                values.append(row[col_index])
                has_anno = self.annotations.has_cell_annotation(
                    col.identifier, row_id)
                if not has_anno:
                    # Without an explicit annotation the cell is flagged if
                    # its entry in the result's 'col_taint' matrix is falsy
                    has_anno = not rs['col_taint'][row_index][col_index]
                row_annos.append(has_anno)
            self.rows.append(
                DatasetRow(row_id, values, annotations=row_annos))
        # Order rows by the sort key that self.sortbyrowid() returns for
        # their identifier
        self.rows.sort(key=lambda row: self.sortbyrowid(row.identifier))
        self.read_index = 0
        self.is_open = True
        return self
Пример #8
0
    def update_cell(self, identifier, column, row, value):
        """Set the value of a single cell and store the result as a new
        dataset.

        Raises ValueError if no dataset with given identifier exists or if the
        row index is outside of the current dataset bounds. Errors raised
        while resolving the column identifier are passed on to the caller.

        Parameters
        ----------
        identifier : string
            Unique dataset identifier
        column : int
            Unique column identifier
        row : int
            Row index for updated cell (starting at 0)
        value : string
            New cell value

        Returns
        -------
        int, string
            Number of updated rows (i.e., 1) and identifier of resulting
            dataset
        """
        source = self.datastore.get_dataset(identifier)
        if source is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        # Resolve the column first in case it raises an exception
        value_idx = get_index_for_column(source, column)
        # The row index has to reference an existing row
        all_rows = source.fetch_rows()
        out_of_bounds = row < 0 or row >= len(all_rows)
        if out_of_bounds:
            raise ValueError('invalid cell [' + str(column) + ', ' + str(row) +
                             ']')
        # Replace the affected row with a new row that has the updated value
        current = all_rows[row]
        new_values = list(current.values)
        new_values[value_idx] = value
        all_rows[row] = DatasetRow(current.identifier, new_values)
        # Storing the modified rows yields the new dataset identifier
        result = self.datastore.create_dataset(
            columns=source.columns,
            rows=all_rows,
            column_counter=source.column_counter,
            row_counter=source.row_counter,
            annotations=source.annotations)
        return 1, result.identifier
Пример #9
0
    def insert_row(self, identifier, position):
        """Insert a new row at the given position and store the result as a
        new dataset. All values in the new row are None.

        Raises ValueError if no dataset with given identifier exists or if the
        specified row position is outside the dataset bounds. Inserting at
        the position after the last row is valid.

        Parameters
        ----------
        identifier : string
            Unique dataset identifier
        position : int
            Index position at which the row will be inserted

        Returns
        -------
        int, string
            Number of inserted rows (i.e., 1) and identifier of resulting
            dataset
        """
        source = self.datastore.get_dataset(identifier)
        if source is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        # Valid positions are 0 up to and including the current row count
        all_rows = source.fetch_rows()
        out_of_bounds = position < 0 or position > len(all_rows)
        if out_of_bounds:
            raise ValueError('invalid row index \'' + str(position) + '\'')
        # The new row takes the current row counter value as its identifier
        # and has one empty value per column
        empty_values = [None] * len(source.columns)
        all_rows.insert(position, DatasetRow(source.row_counter, empty_values))
        # Store the result with an incremented row counter
        result = self.datastore.create_dataset(
            columns=source.columns,
            rows=all_rows,
            column_counter=source.column_counter,
            row_counter=source.row_counter + 1,
            annotations=source.annotations)
        return 1, result.identifier