示例#1
0
def parse_columns(dataset, formula, name, dframe=None, no_index=False):
    """Evaluate a formula against a DataFrame and return the new columns.

    The formula is parsed into a list of functions which are then applied to
    the DataFrame.  When no DataFrame is supplied one is fetched from
    `dataset`, restricted to the columns the formula depends on.

    :param dataset: The dataset passed to the parser and used to fetch a
        DataFrame when `dframe` is None.
    :param formula: The formula to parse.
    :param name: Name of the formula.
    :param dframe: A DataFrame to apply functions to, fetched from `dataset`
        when None.
    :param no_index: Drop the index on result columns.
    """
    functions = Parser.parse_functions(formula)
    dependent_columns = Parser.dependent_columns(formula, dataset)

    # a caller-supplied frame is used as is
    if dframe is not None:
        return __build_columns(dataset, dframe, functions, name, no_index)

    # select only what the formula needs, falling back to the Mongo ID when
    # the formula depends on no columns at all
    needed = dependent_columns or [MONGO_ID]
    query_args = QueryArgs(select=dict.fromkeys(needed, 1))

    fetched = dataset.dframe(query_args=query_args, keep_mongo_keys=True)
    dframe = fetched.set_index(MONGO_ID_ENCODED)

    if not dependent_columns:
        # constant column, use dummy
        dframe['dummy'] = 0

    return __build_columns(dataset, dframe, functions, name, no_index)
示例#2
0
 def eval(self, row, dataset):
     """Return the percentile of this row's field within its column.

     :param row: The row the field value is read from.
     :param dataset: The dataset the column is fetched from.
     :returns: The result of `percentileofscore` for the field against
         the fetched column.
     """
     column_name = self.value.value

     # fetch only the referenced column from the dataset
     only_column = QueryArgs(select={column_name: 1})
     values = dataset.dframe(query_args=only_column)[column_name]

     score = self.value.field(row)
     return percentileofscore(values, score)
    def test_plot_select(self):
        """Plot with a `select` and check against the selected frame."""
        select = {'community_pop': 1}
        result = self.controller.plot(
            self.dataset_id, select=json.dumps(select))

        expected = self.dataset.dframe(QueryArgs(select=select))
        self.__test_result(result, expected)
    def test_plot(self):
        """Plot with no arguments and check against the numeric columns."""
        result = self.controller.plot(self.dataset_id)

        # expected frame: numeric columns only, with missing rows dropped
        numerics = QueryArgs(select=self.dataset.schema.numerics_select)
        expected = self.dataset.dframe(query_args=numerics).dropna()

        self.__test_result(result, expected)
示例#5
0
    def __parse_query_args(self, limit, order_by, query, select,
                           distinct=None, dataset=None):
        """Build a QueryArgs from raw request parameters.

        `limit` goes through `parse_int` with a default of 0, `query` and
        `select` go through this class's own parsers, and the remaining
        arguments are handed to QueryArgs unchanged.

        :returns: A QueryArgs built from the parsed arguments.
        """
        # keyword arguments are evaluated left to right, so the parsers run
        # in the order limit, query, select
        return QueryArgs(limit=parse_int(limit, 0),
                         query=self.__parse_query(query),
                         select=self.__parse_select(select),
                         distinct=distinct,
                         order_by=order_by,
                         dataset=dataset)
示例#6
0
    def observations(self, query_args=None, as_cursor=False):
        """Fetch the observations belonging to this dataset.

        :param query_args: An optional QueryArgs to hold the query arguments,
            a default QueryArgs is used when omitted.
        :param as_cursor: Return the observations as a cursor.
        """
        if not query_args:
            query_args = QueryArgs()

        return Observation.find(self, query_args, as_cursor=as_cursor)
示例#7
0
    def test_find_with_select(self):
        """Find with a select and check which keys the first row carries."""
        self.__save_records()
        query_args = QueryArgs(select={"rating": 1})
        rows = Observation.find(self.dataset, query_args)

        # assertIsInstance reports the actual type on failure
        self.assertIsInstance(rows, list)

        row = self.__decode(rows[0])

        # assertEqual is the supported spelling; assertEquals is a deprecated
        # alias and the other tests here already use assertEqual
        self.assertEqual(sorted(row.keys()), ['_id', 'rating'])
示例#8
0
    def rolling(self, win_type, window):
        """Compute a rolling window across the numeric columns.

        :param win_type: The type of window, see pandas pandas.rolling_window.
        :param window: The number of observations used for calculating the
            window.
        :returns: A DataFrame of the rolling window calculated for this
            dataset.
        """
        # restrict the frame to the schema's numeric columns
        numerics = QueryArgs(select=self.schema.numerics_select)
        numeric_dframe = self.dframe(numerics)

        return rolling_window(numeric_dframe, window, win_type)
示例#9
0
    def resample(self, date_column, interval, how, query=None):
        """Resample this dataset onto a new time frame.

        :param date_column: The date column use as the index for resampling.
        :param interval: The interval code for resampling.
        :param how: How to aggregate in the resample.
        :param query: An optional query passed to QueryArgs when fetching
            the DataFrame.
        :returns: A DataFrame of the resampled DataFrame for this dataset.
        """
        dframe = self.dframe(QueryArgs(query=query))

        # index by the date column, resample, then restore a plain index
        by_date = dframe.set_index(date_column)
        return by_date.resample(interval, how=how).reset_index()
    def test_plot_index(self):
        """Plot a freshly posted dataset with an index column."""
        dataset_id = self._post_file()
        dataset = Dataset.find_one(dataset_id)

        column = 'amount'
        select = {column: 1}
        result = self.controller.plot(dataset_id,
                                      select=json.dumps(select),
                                      index='submit_date')

        # Build the expected frame from the dataset that was actually
        # plotted.  Previously a full frame was fetched from it and thrown
        # away, and the comparison used `self.dataset` instead.
        dframe = dataset.dframe(QueryArgs(select=select))
        self.__test_result(result, dframe)
示例#11
0
    def dframe(self,
               query_args=None,
               keep_parent_ids=False,
               padded=False,
               index=False,
               reload_=False,
               keep_mongo_keys=False):
        """Fetch the dframe for this dataset.

        Only the plain frame (no query arguments and no options that alter
        its rows or columns) is served from, or stored in, the cache.

        :param query_args: An optional QueryArgs to hold the query arguments.
        :param keep_parent_ids: Do not remove parent IDs from the dframe,
            default False.
        :param padded: Used for joining, default False.
        :param index: Return the index with dframe, default False.
        :param reload_: Force refresh of data, default False.
        :param keep_mongo_keys: Used for updating documents, default False.

        :returns: Return DataFrame with contents based on query parameters
            passed to MongoDB. DataFrame will not have parent ids if
            `keep_parent_ids` is False.
        """
        # bypass cache if we need specific version; `index` and
        # `keep_mongo_keys` change the returned columns too, so they must
        # neither read a cached plain frame nor overwrite it
        cacheable = not (query_args or keep_parent_ids or padded or index
                         or keep_mongo_keys)

        # use cached copy if we have already fetched it
        if cacheable and not reload_ and self.__is_cached:
            return self.__dframe

        query_args = query_args or QueryArgs()
        observations = self.observations(query_args, as_cursor=True)

        # distinct queries are wrapped directly, with no decoding or key
        # removal
        if query_args.distinct:
            return DataFrame(observations)

        dframe = Observation.batch_read_dframe_from_cursor(
            self, observations, query_args.distinct, query_args.limit)

        dframe = df_mongo_decode(dframe, keep_mongo_keys=keep_mongo_keys)

        # reserved keys to keep; a list (not a lazy filter object) so the
        # helper receives the same type on Python 2 and 3
        candidates = (keep_parent_ids and PARENT_DATASET_ID, index and INDEX)
        excluded = [key for key in candidates if key]
        dframe = remove_reserved_keys(dframe, excluded)

        if index:
            dframe.rename(columns={INDEX: 'index'}, inplace=True)

        dframe = self.__maybe_pad(dframe, padded)

        if cacheable:
            self.__dframe = dframe

        return dframe
示例#12
0
    def count(self, query_args=None):
        """Count the rows in this dataset that match the query arguments.

        A positive `limit` on the query arguments caps the returned count.

        :param query_args: An optional QueryArgs to hold the query arguments.
        :returns: The number of matching rows, no larger than a positive
            limit.
        """
        if not query_args:
            query_args = QueryArgs()

        cursor = self.observations(query_args, as_cursor=True)

        # distinct results are measured with len(), otherwise the cursor
        # reports its own count
        if query_args.distinct:
            total = len(cursor)
        else:
            total = cursor.count()

        limit = query_args.limit
        return limit if 0 < limit < total else total
示例#13
0
def __create_aggregator(dataset, formula, name, groups, dframe=None):
    """Build an Aggregator for an aggregation formula.

    :param dataset: The dataset to parse against and fetch data from.
    :param formula: The formula to parse.
    :param name: Name of the formula.
    :param groups: The columns to group by.
    :param dframe: An optional DataFrame handed to `parse_columns`.
    :returns: An Aggregator over a freshly fetched DataFrame.
    """
    # TODO this should work with index eventually
    columns = parse_columns(dataset, formula, name, dframe, no_index=True)

    dependent_columns = Parser.dependent_columns(formula, dataset)
    aggregation = Parser.parse_aggregation(formula)

    # get dframe with only the necessary columns
    select = combine_dicts(dict.fromkeys(groups, 1),
                           dict.fromkeys(dependent_columns, 1))

    if select:
        query_args = QueryArgs(select=select)
    else:
        # ensure at least one column (MONGO_ID) for the count aggregation
        query_args = QueryArgs(select={MONGO_ID: 1})

    # Mongo keys are kept only when nothing else was selected
    selected_dframe = dataset.dframe(query_args=query_args,
                                     keep_mongo_keys=not select)

    return Aggregator(selected_dframe, groups, aggregation, name, columns)
示例#14
0
    def find(cls, dataset, include_aggs=True, only_aggs=False):
        """Return the calculations for `dataset`, ordered by name.

        :param dataset: The dataset to retrieve the calculations for.
        :param include_aggs: Include aggregations, default True.
        :param only_aggs: Exclude non-aggregations, default False.
        """
        query = {DATASET_ID: dataset.dataset_id}

        # `only_aggs` takes precedence when both filters are requested
        if only_aggs:
            query[cls.AGGREGATION] = {'$ne': None}
        elif not include_aggs:
            query[cls.AGGREGATION] = None

        return super(cls, cls).find(QueryArgs(query=query, order_by='name'))
    def test_delete_with_query(self):
        """Delete with a query and check the remaining row count."""
        dataset_id = self._post_file()
        query = {'food_type': 'caffeination'}
        dataset = Dataset.find_one(dataset_id)

        # rows expected to survive: all rows minus those matching the query
        matching = dataset.dframe(query_args=QueryArgs(query=query))
        expected_rows = len(dataset.dframe()) - len(matching)

        query_json = json.dumps(query)
        response = self.controller.delete(dataset_id, query=query_json)
        result = json.loads(response)
        message = result[Datasets.SUCCESS]

        self.assertTrue('deleted dataset' in message)
        self.assertTrue(query_json in message)
        self.assertEqual(result[Dataset.ID], dataset_id)

        remaining = Dataset.find_one(dataset_id).dframe()

        self.assertEqual(len(remaining), expected_rows)
示例#16
0
        def action(dataset, query=query, select=select, limit=limit):
            """Summarize `dataset` using the enclosing request's arguments.

            The defaults bind the enclosing call's raw `query`, `select` and
            `limit`; `group`, `order_by` and `flat` are read from the
            enclosing scope.

            :param dataset: The dataset to summarize.
            :raises ArgumentError: If the dataset is not finished importing.
            :returns: The result of `dataset.summarize` for the queried
                frame.
            """
            if not dataset.is_ready:
                raise ArgumentError('dataset is not finished importing')

            limit = parse_int(limit, 0)
            query = self.__parse_query(query)
            select = self.__parse_select(select, required=True)

            groups = dataset.split_groups(group)

            # a plain loop rather than a throwaway list: the calls are made
            # only for their effect (presumably raising on an invalid column)
            for column in groups:
                valid_column(dataset, column)

            # if select append groups to select
            if select:
                select.update(dict.fromkeys(groups, 1))

            query_args = QueryArgs(query=query, select=select, limit=limit,
                                   order_by=order_by)
            dframe = dataset.dframe(query_args)

            return dataset.summarize(dframe, groups=groups,
                                     no_cache=query or select, flat=flat)
示例#17
0
def __update_is_valid(dataset, new_dframe):
    """Decide whether an update keeps existing joins valid.

    For every join column where this dataset is the right-hand side, and
    which appears both in the update and in the dataset, the existing and
    new values are combined.  The update is rejected when the number of
    combined values differs from their number of unique values.

    :param dataset: The dataset to check if update valid for.
    :param new_dframe: The update dframe to check.
    :returns: True if the update is valid, False otherwise.
    """
    select = dict(
        (column, 1) for column in dataset.on_columns_for_rhs_of_joins
        if column in new_dframe.columns and column in dataset.columns)
    dframe = dataset.dframe(query_args=QueryArgs(select=select))

    for join_column in select:
        combined = concat([new_dframe[join_column], dframe[join_column]])

        if combined.nunique() != len(combined):
            return False

    return True
示例#18
0
    def find(cls,
             dataset,
             query_args=None,
             as_cursor=False,
             include_deleted=False):
        """Return observation rows matching parameters.

        The passed `query_args` is modified in place: its `query` and
        `select` are reassigned and `encode` is called on it before it is
        handed to the parent `find`.

        :param dataset: Dataset to return rows for.
        :param query_args: An optional QueryArgs to hold the query arguments,
            a new QueryArgs is created when omitted.
        :param as_cursor: Passed on to the parent `find`; a truthy
            `query_args.distinct` is passed in its place when this is False.
        :param include_deleted: If True, return deleted records, default
            False.

        :raises: `JSONError` if the query could not be parsed.

        :returns: The records from the parent `find`, or, when
            `query_args.distinct` is set, the result of calling `distinct`
            on them for that column.
        """
        encoding = cls.encoding(dataset) or {}
        query_args = query_args or QueryArgs()

        query_args.query = parse_timestamp_query(query_args.query,
                                                 dataset.schema)
        # encode the arguments, passing this dataset's ID along
        query_args.encode(encoding, {DATASET_ID: dataset.dataset_id})

        if not include_deleted:
            # read, modify and write back the query so that only records
            # whose deleted-at value is 0 match
            query = query_args.query
            query[cls.DELETED_AT] = 0
            query_args.query = query

        # exclude deleted at column
        query_args.select = query_args.select or {cls.DELETED_AT: 0}

        distinct = query_args.distinct
        # `.distinct` is called on the result below, so a truthy `distinct`
        # is passed as `as_cursor` even when the caller did not ask for one
        records = super(cls, cls).find(query_args,
                                       as_dict=True,
                                       as_cursor=(as_cursor or distinct))

        # look up the encoded column name, falling back to the raw name
        return records.distinct(encoding.get(distinct, distinct)) if distinct\
            else records
示例#19
0
 def setUp(self):
     """Save a dataset for the tests and prepare shared query args."""
     TestBase.setUp(self)

     self.dataset = dataset = Dataset()
     dataset.save(self.test_dataset_ids['good_eats.csv'])

     rating_query = {"rating": "delectible"}
     self.query_args = QueryArgs(rating_query)
示例#20
0
 def parent_ids(self):
     """Return the distinct parent dataset IDs among the observations."""
     distinct_parents = QueryArgs(distinct=PARENT_DATASET_ID,
                                  select={PARENT_DATASET_ID: 1})
     return self.observations(distinct_parents)
示例#21
0
 def find(cls, dataset_id):
     """Return datasets for `dataset_id`."""
     by_dataset_id = {DATASET_ID: dataset_id}
     return super(cls, cls).find(QueryArgs(query=by_dataset_id))