def test_move_dataset(self):
    """
    Integration test for moving datasets into projects.

    Exercises dataset placement both as the default logged-in user and as
    a second editor user ("fo"), and verifies the project's index after
    each move.  NOTE(review): runs against a live Crunch host (``HOST``)
    and relies on module-level fixtures (``site``, ``username2``,
    ``password2``) — not a unit test.
    """
    fo = get_user(username2)
    fo_site = connect(fo.email, password2, HOST)
    # These two datasets are created by the default logged user
    _ds1 = site.datasets.create(
        shoji_entity_wrapper({'name': 'test_move_dataset1'})).refresh()
    _ds2 = site.datasets.create(
        shoji_entity_wrapper({'name': 'test_move_dataset2'})).refresh()
    # This dataset is created and owned by the other user
    _ds4 = fo_site.datasets.create(
        shoji_entity_wrapper({
            'name': 'test_move_dataset4',
            'owner': fo.url
        })).refresh()
    ds1 = get_dataset(_ds1.body.id)
    ds2 = get_dataset(_ds2.body.id)
    ds4 = get_dataset(_ds4.body.id, connection=fo_site)
    # Give the other user edit rights on ds2 (it is moved later by the owner)
    ds2.add_user(fo, edit=True)

    # Create a hierarchy A -> B
    pa = new_project("test_move_dataset_A")
    pa.move_here([ds1])  # Put ds1 in A
    pb = pa.create_project("test_move_dataset_B")
    pa.add_user(fo, edit=True)

    # Move ds4 to B as the other user
    fo_pa = get_project(pa.name, fo_site)
    fo_pa.place(ds4, path="| %s" % pb.name)
    pb.resource.refresh()
    self.assertItemsEqual(
        pb.resource.index.keys(),
        # Only ds4 here
        [_ds4.self])

    # The other user moves ds1 (owned by the default user) from A to B
    fo_ds1 = get_dataset(_ds1.body.id, connection=fo_site)
    fo_pa.place(fo_ds1, path="| %s" % pb.name)
    pb.resource.refresh()
    self.assertItemsEqual(pb.resource.index.keys(),
                          [_ds1.self, _ds4.self])

    # The default user moves ds2 straight into B
    pa.place(ds2, path="| %s" % pb.name)
    pb.resource.refresh()
    self.assertItemsEqual(pb.resource.index.keys(),
                          [_ds1.self, _ds2.self, _ds4.self])
    # The dataset entity itself now points at project B
    self.assertEqual(ds2.resource.project.self, pb.url)
def move_to_categorical_array(self, name, alias, subvariables,
                              description='', notes=''):
    """
    Create a new ``categorical_array`` variable whose subvariables are
    existing dataset variables.

    This is a dangerous method: the listed variables stop being standalone
    dataset variables and are moved into the newly created array as its
    subvariables.

    :param: name: Name of the new variable.
    :param: alias: Alias of the new variable
    :param: subvariables: A list of existing Dataset variables aliases to
        move into the new variable as subvariables, i.e.
        subvariables = ['var1_alias', 'var2_alias']
    :param: description: A description of the new variable
    :param: notes: Notes to attach to the new variable
    """
    # Resolve each alias to the URL of the variable being absorbed
    subvariable_urls = [self[subvar].url for subvar in subvariables]
    new_variable = {
        'name': name,
        'alias': alias,
        'description': description,
        'notes': notes,
        'type': 'categorical_array',
        'subvariables': subvariable_urls,
    }
    self.resource.variables.create(shoji_entity_wrapper(new_variable))
    # Refresh the local variable catalog so the new array is visible
    self._reload_variables()
    return self[alias]
def push_rows(self, count=None):
    """
    Batches in the rows that have been recently streamed. This forces
    the rows to appear in the dataset instead of waiting for crunch
    automatic batcher process.

    :param: count: Number of rows to batch in (sent as the ``stream``
        value of the batch payload); ``None`` streams all pending rows —
        TODO confirm against the Crunch streaming API.
    """
    # Nothing pending: no batch to create.
    if not self.resource.stream.body.pending_messages:
        return
    self.resource.batches.create(shoji_entity_wrapper({
        'stream': count,
        'type': 'ldjson',
    }))
def append_dataset(self, dataset, filter=None, variables=None,
                   autorollback=True, delete_pk=True):
    """ Append dataset into self. If this operation fails, the
    append is rolled back.

    Dataset variables and subvariables are matched on their aliases
    and categories are matched by name.

    :param: dataset: Dataset instance to append from
    :param: filter: An expression to filter dataset rows. cannot be a Filter
        according to: http://docs.crunch.io/#get211
    :param: variables: A list of variable names to include from dataset
    :param: autorollback: When True, a failed append is rolled back server
        side (sent as the batch's ``autorollback`` attribute)
    :param: delete_pk: When True, delete the primary keys of both datasets
        before appending; pass False to keep them
    """
    if self.url == dataset.url:
        raise ValueError("Cannot append dataset to self")
    if variables and not isinstance(variables, list):
        raise AttributeError(
            "'variables' must be a list of variable names")
    if delete_pk:
        LOG.info(
            "Any pk's found will be deleted, to avoid these pass delete_pk=False"
        )
        # pk deletion happens on BOTH datasets, unconditionally —
        # NOTE(review): no check that a pk actually exists; presumably the
        # endpoint tolerates deleting a missing pk — confirm
        self.resource.pk.delete()
        dataset.resource.pk.delete()
    payload = shoji_entity_wrapper({'dataset': dataset.url})
    payload['autorollback'] = autorollback
    if variables:
        id_vars = []
        for var in variables:
            id_vars.append(dataset[var].url)
        # build the payload with selected variables
        payload['body']['where'] = {
            'function': 'select',
            'args': [{
                'map': {x: {
                    'variable': x
                } for x in id_vars}
            }]
        }
    if filter:
        # parse the filter expression
        payload['body']['filter'] = process_expr(parse_expr(filter),
                                                 dataset.resource)
    return self.resource.batches.create(payload)
def create_dataset(name, variables, connection=None, **kwargs):
    """
    Create a brand new dataset on the Crunch server.

    :param: name: Name for the new dataset
    :param: variables: Variable metadata placed under the dataset's
        ``table.metadata`` key
    :param: connection: Optional API session; when omitted, the default
        session from scrunch.connect() / environment config is used
    :param: kwargs: Extra entity attributes merged into the dataset body
        (they override ``name``/``table`` on key collision, matching the
        original ``dict.update`` semantics)
    :raises AttributeError: when no connection is available
    :return: a MutableDataset wrapping the freshly created entity
    """
    if connection is None:
        connection = _get_connection()
        if not connection:
            raise AttributeError(
                "Authenticate first with scrunch.connect() or by providing "
                "config/environment variables")
    body = {
        'name': name,
        'table': {
            'element': 'crunch:table',
            'metadata': variables,
        },
    }
    body.update(**kwargs)
    shoji_ds = connection.datasets.create(shoji_entity_wrapper(body)).refresh()
    return MutableDataset(shoji_ds)
def move_to_multiple_response(self, name, alias, subvariables,
                              description='', notes=''):
    """
    This method is a replication of the method move_to_categorical_array,
    only this time we are creating a multiple_response variable.

    Note: the subvariables need to have at least 1 selected category.

    :param: name: Name of the new variable.
    :param: alias: Alias of the new variable
    :param: subvariables: A list of existing Dataset variables aliases to
        move into the new variable as subvariables
    :param: description: A description of the new variable
    :param: notes: Notes to attach to the new variable
    """
    payload = {
        'name': name,
        'alias': alias,
        'description': description,
        'notes': notes,
        'type': 'multiple_response',
        'subvariables': [self[v].url for v in subvariables]
    }
    self.resource.variables.create(shoji_entity_wrapper(payload))
    # Refresh the local variable catalog so the new array is visible
    self._reload_variables()
    return self[alias]
def join(self, left_var, right_ds, right_var, columns=None,
         filter=None, timeout=30):
    """
    Joins a given variable. In crunch joins are left joins, where
    left is the dataset variable and right is other dataset variable.
    For more information see:
    http://docs.crunch.io/?http#merging-and-joining-datasets

    :param: columns: Specify a list of variables from right dataset
        to bring in the merge:
        http://docs.crunch.io/?http#joining-a-subset-of-variables
        NOTE(review): silently ignored unless it is a ``list``
    :param: timeout: Seconds given to the progress tracker while polling
        for the join to finish
    :param: filter: Filters out rows based on the given expression,
        or on a given url for an existing filter. TODO: for the moment
        we only allow expressions
    """
    right_var_url = right_ds[right_var].url
    left_var_url = self[left_var].url
    # this dictionary sets the main part of the join
    adapter = {
        'function': 'adapt',
        'args': [{
            'dataset': right_ds.url
        }, {
            'variable': right_var_url
        }, {
            'variable': left_var_url
        }]
    }
    # wrap the adapter method on a shoji and body entity
    payload = shoji_entity_wrapper(adapter)
    if columns and isinstance(columns, list):
        # overwrite body to new format
        payload['body'] = {
            'frame': adapter,
            'function': 'select',
            'args': [{
                'map': {}
            }]
        }
        # add the individual variable columns to the payload
        alias_list = right_ds.resource.variables.by("alias")
        var_urls = [alias_list[alias].entity_url for alias in columns]
        var_url_list = {
            var_url: {
                "variable": var_url
            } for var_url in var_urls
        }
        payload['body']['args'][0]['map'] = var_url_list
    if filter:
        # in the case of a filter, convert it to crunch
        # and attach the filter to the payload
        expr = process_expr(parse_expr(filter), right_ds)
        payload['body']['filter'] = {'expression': expr}
    progress = self.resource.variables.post(payload)
    # poll for progress to finish or return the url to progress
    progress_tracker = DefaultProgressTracking(timeout)
    return wait_progress(r=progress, session=self.resource.session,
                         progress_tracker=progress_tracker, entity=self)
def new_project(name):
    """
    Create a project on the test site and wrap it in a Project.

    The module-level UNIQUE_PREFIX is appended to the name so repeated
    test runs don't collide on project names.
    """
    project_doc = shoji_entity_wrapper({"name": name + UNIQUE_PREFIX})
    created = site.projects.create(project_doc).refresh()
    return Project(created)