def select(self, *columns):
    """Creates a CassandraRDD with the select clause applied.

    @param *columns: The columns to select. Each one is converted with
        str() before being handed to the JVM side.
    @return: A shallow copy of this RDD with its _cjrdd and _jrdd replaced.
    """
    selected = copy(self)

    column_names = (str(col) for col in columns)
    java_columns = as_java_array(self.ctx._gateway, "String", column_names)
    selected._cjrdd = selected._cjrdd.select(java_columns)

    # _jrdd is rebuilt by parsing the rows of the new _cjrdd
    selected._jrdd = self._helper.parseRows(selected._cjrdd, self.row_format)
    return selected
def asDataFrames(self, *index_by):
    '''
        Reads the spanned rows as DataFrames if pandas is available, or as
        a dict of numpy arrays if only numpy is available or as a dict with
        primitives and objects otherwise.

        @param index_by: If pandas is available, the dataframes will be
            indexed by the given columns.
        @return: An RDD built from the spanned rows. When index_by is given
            and pandas is available its values are re-indexed by those
            columns.
        @raise ValueError: If one of the index_by columns is also one of
            the columns by which the rows are spanned (self.columns).
    '''
    for c in index_by:
        if c in self.columns:
            # the message has a %s placeholder, so the offending column
            # must actually be interpolated into it
            raise ValueError(
                'column %s cannot be used as index in the data'
                'frames as it is a column by which the rows are spanned.'
                % (c,))

    columns = as_java_array(self.ctx._gateway, "String",
                            (str(c) for c in self.columns))
    jrdd = self._helper.spanBy(self._cjrdd, columns)
    rdd = RDD(jrdd, self.ctx)

    # pd is only read here, so no global declaration is needed
    if index_by and pd:
        # Pass the keys as ONE list: set_index(*keys) would spread them
        # over set_index's other positional parameters (drop, append, ...).
        index_columns = [str(c) for c in index_by]
        return rdd.mapValues(lambda df: df.set_index(index_columns))
    else:
        return rdd
def select(self, *columns):
    """Creates a CassandraRDD with the select clause applied.

    @param *columns: The columns to select. Each one is converted with
        str() before being handed to the JVM side.
    @return: A shallow copy of this RDD whose _jrdd has the selection
        applied.
    """
    column_names = (str(col) for col in columns)
    java_columns = as_java_array(self.ctx._gateway, "String", column_names)

    narrowed = copy(self)
    narrowed._jrdd = narrowed._jrdd.select(java_columns)
    return narrowed
def where(self, clause, *args):
    """Creates a CassandraRDD with a CQL where clause applied.

    @param clause: The where clause, either complete or with ? markers
    @param *args: The parameters for the ? markers in the where clause.
    @return: Whatever self._specialize returns for the 'where' operation.
    """
    java_args = as_java_array(self.ctx._gateway, "Object", args)
    return self._specialize('where', clause, java_args)
def saveToCassandra(rdd, keyspace=None, table=None, columns=None,
                    write_conf=None, row_format=None):
    '''
        Saves an RDD to Cassandra. The RDD is expected to contain dicts with
        keys mapping to CQL columns.

        Arguments:
        @param rdd(RDD):
            The RDD to save. Equals to self when invoking saveToCassandra on
            a monkey patched RDD.
        @param keyspace(string):
            The keyspace to save the RDD in. If not given and the rdd is a
            CassandraRDD the same keyspace is used.
        @param table(string):
            The CQL table to save the RDD in. If not given and the rdd is a
            CassandraRDD the same table is used.

        Keyword arguments:
        @param columns(iterable):
            The columns to save, i.e. which keys to take from the dicts in
            the RDD. If None given all columns are stored.
        @param write_conf(WriteConf):
            A WriteConf object to use when saving to Cassandra
        @param row_format(RowFormat):
            Make explicit how to map the RDD elements into Cassandra rows.
            If None given the mapping is auto-detected as far as possible.

        @raise ValueError:
            If no keyspace or no table was given and the rdd does not
            provide one either.
    '''
    # A plain RDD has no keyspace / table attribute; fall back to None so
    # that the ValueError below is raised instead of an AttributeError.
    keyspace = keyspace or getattr(rdd, 'keyspace', None)
    if not keyspace:
        raise ValueError("keyspace not set")

    table = table or getattr(rdd, 'table', None)
    if not table:
        raise ValueError("table not set")

    # create write config as map and convert the columns to a string array
    gateway = rdd.ctx._gateway
    write_conf = as_java_object(gateway, write_conf.__dict__) if write_conf else None
    columns = as_java_array(gateway, "String", columns) if columns else None

    # create a helper object through the JVM thread's context class loader
    class_loader = rdd.ctx._jvm.java.lang.Thread.currentThread().getContextClassLoader()
    helper = class_loader.loadClass("pyspark_cassandra.PythonHelper").newInstance()

    # delegate to helper
    helper.saveToCassandra(
        rdd._jrdd,
        keyspace,
        table,
        columns,
        write_conf,
        row_format
    )
def where(self, clause, *args):
    """Creates a CassandraRDD with a CQL where clause applied.

    @param clause: The where clause, either complete or with ? markers
    @param *args: The parameters for the ? markers in the where clause.
    @return: A shallow copy of this RDD whose _jrdd has the where clause
        applied.
    """
    java_args = as_java_array(self.ctx._gateway, "Object", args)

    filtered = copy(self)
    filtered._jrdd = filtered._jrdd.where(clause, java_args)
    return filtered
def saveToCassandra(rdd, keyspace=None, table=None, columns=None,
                    row_format=None, keyed=None, write_conf=None,
                    **write_conf_kwargs):
    '''
        Saves an RDD to Cassandra. The RDD is expected to contain dicts with
        keys mapping to CQL columns.

        Arguments:
        @param rdd(RDD):
            The RDD to save. Equals to self when invoking saveToCassandra on
            a monkey patched RDD.
        @param keyspace(string):
            The keyspace to save the RDD in. If not given and the rdd is a
            CassandraRDD the same keyspace is used.
        @param table(string):
            The CQL table to save the RDD in. If not given and the rdd is a
            CassandraRDD the same table is used.

        Keyword arguments:
        @param columns(iterable):
            The columns to save, i.e. which keys to take from the dicts in
            the RDD. If None given all columns are stored.
        @param row_format(RowFormat):
            Make explicit how to map the RDD elements into Cassandra rows.
            If None given the mapping is auto-detected as far as possible.
        @param keyed(bool):
            Make explicit that the RDD consists of key, value tuples (and
            not arrays of length two).
        @param write_conf(WriteConf):
            A WriteConf object to use when saving to Cassandra
        @param **write_conf_kwargs:
            WriteConf parameters to use when saving to Cassandra

        @raise ValueError:
            If no keyspace or no table was given and the rdd does not
            provide one either.
    '''
    # fall back on the attributes of the rdd, if it has them
    if not keyspace:
        keyspace = getattr(rdd, 'keyspace', None)
    if not keyspace:
        raise ValueError("keyspace not set")

    if not table:
        table = getattr(rdd, 'table', None)
    if not table:
        raise ValueError("table not set")

    # create write config as map
    conf = WriteConf.build(write_conf, **write_conf_kwargs)
    java_conf = as_java_object(rdd.ctx._gateway, conf.settings())

    # convert the columns to a string array
    java_columns = None
    if columns:
        java_columns = as_java_array(rdd.ctx._gateway, "String", columns)

    helper(rdd.ctx).saveToCassandra(
        rdd._jrdd,
        keyspace,
        table,
        java_columns,
        row_format,
        keyed,
        java_conf,
    )
def asDataFrames(self, *index_by):
    '''
        Reads the spanned rows as DataFrames if pandas is available, or as
        a dict of numpy arrays if only numpy is available or as a dict with
        primitives and objects otherwise.

        @param index_by: If pandas is available, the dataframes will be
            indexed by the given columns.
        @return: An RDD built from the spanned rows. When index_by is given
            and pandas is available its values are re-indexed by those
            columns.
        @raise ValueError: If one of the index_by columns is also one of
            the columns by which the rows are spanned (self.columns).
    '''
    spanned_by = self.columns
    for column in index_by:
        if column in spanned_by:
            # interpolate the column name; the message carries a %s
            # placeholder which would otherwise be shown verbatim
            message = ('column %s cannot be used as index in the data'
                       'frames as it is a column by which the rows are spanned.')
            raise ValueError(message % (column,))

    java_columns = as_java_array(self.ctx._gateway, "String",
                                 (str(c) for c in spanned_by))
    jrdd = self._helper.spanBy(self._cjrdd, java_columns)
    rdd = RDD(jrdd, self.ctx)

    # reading the module level pd needs no global declaration
    if not (index_by and pd):
        return rdd

    # set_index expects all keys as a single list argument; unpacking them
    # would bind the second and later columns to drop, append, ...
    index_columns = [str(c) for c in index_by]
    return rdd.mapValues(lambda frame: frame.set_index(index_columns))
def on(self, *columns):
    """Applies the 'on' specialization for the given columns.

    @param *columns: The columns to pass on. Each one is converted with
        str() and the lot is handed over as a Java String array.
    @return: Whatever self._specialize returns for the 'on' operation.
    """
    column_names = (str(col) for col in columns)
    java_columns = as_java_array(self.ctx._gateway, "String", column_names)
    return self._specialize('on', java_columns)
def select(self, *columns):
    """Creates a CassandraRDD with the select clause applied.

    @param *columns: The columns to select. Each one is converted with
        str() and the lot is handed over as a Java String array.
    @return: Whatever self._specialize returns for the 'select' operation.
    """
    column_names = (str(col) for col in columns)
    java_columns = as_java_array(self.ctx._gateway, "String", column_names)
    return self._specialize('select', java_columns)