def __init__(self, keyspace, table, ctx, row_format=None, read_conf=None):
    """Build an RDD that reads a Cassandra table.

    Arguments:
    @param keyspace(string): The keyspace of the table to read (required).
    @param table(string): The CQL table to read (required).
    @param ctx: The SparkContext, used for its JVM gateway.
    @param row_format(RowFormat): How rows are mapped into Python objects;
        defaults to RowFormat.ROW.
    @param read_conf(ReadConf): Optional read configuration.
    """
    if not keyspace:
        raise ValueError("keyspace not set")
    if not table:
        raise ValueError("table not set")

    # default and validate the row format
    if row_format is None:
        row_format = RowFormat.ROW
    elif row_format < 0 or row_format >= len(RowFormat.values):
        raise ValueError("invalid row_format %s" % row_format)

    self.keyspace = keyspace
    self.table = table
    self.read_conf = read_conf
    self.row_format = row_format

    # the read configuration travels to the JVM as a java Map (or null)
    jread_conf = as_java_object(ctx._gateway, read_conf.__dict__) if read_conf else None

    # load the scala/java helper through the context class loader
    loader = ctx._jvm.java.lang.Thread.currentThread().getContextClassLoader()
    self._helper = loader.loadClass("pyspark_cassandra.PythonHelper").newInstance()

    # the raw table RDD on the JVM side, then its rows parsed per row_format
    self._cjrdd = self._helper.cassandraTable(keyspace, table, ctx._jsc, jread_conf)
    jrdd = self._helper.parseRows(self._cjrdd, row_format)

    super(CassandraRDD, self).__init__(jrdd, ctx)
def __init__(self, keyspace, table, ctx, row_format=None, read_conf=None):
    """Build an RDD that reads a Cassandra table.

    Arguments:
    @param keyspace(string): The keyspace of the table to read (required).
    @param table(string): The CQL table to read (required).
    @param ctx: The SparkContext, used for its JVM gateway.
    @param row_format(RowFormat): How rows are mapped into Python objects;
        defaults to RowFormat.ROW.
    @param read_conf(ReadConf): Optional read configuration.
    """
    if not keyspace:
        raise ValueError("keyspace not set")
    if not table:
        raise ValueError("table not set")

    # BUG FIX: the check was `if not row_format:`, which silently replaced a
    # valid falsy row format (value 0 — explicitly allowed by the range check
    # below) with RowFormat.ROW. Only None means "use the default".
    if row_format is None:
        row_format = RowFormat.ROW
    elif row_format < 0 or row_format >= len(RowFormat.values):
        raise ValueError("invalid row_format %s" % row_format)

    self.keyspace = keyspace
    self.table = table
    self.read_conf = read_conf
    self.row_format = row_format

    # the read configuration travels to the JVM as a java Map (or null)
    jread_conf = as_java_object(ctx._gateway, read_conf.__dict__) if read_conf else None

    # load the scala/java helper through the context class loader
    self._helper = ctx._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
        .loadClass("pyspark_cassandra.PythonHelper").newInstance()

    # the raw table RDD on the JVM side, then its rows parsed per row_format
    self._cjrdd = self._helper \
        .cassandraTable(
            keyspace,
            table,
            ctx._jsc,
            jread_conf,
        )
    jrdd = self._helper.parseRows(self._cjrdd, row_format)

    super(CassandraRDD, self).__init__(jrdd, ctx)
def saveToCassandra(rdd, keyspace=None, table=None, columns=None, write_conf=None, row_format=None):
    '''
        Saves an RDD to Cassandra. The RDD is expected to contain dicts with keys mapping to CQL columns.

        Arguments:
        @param rdd(RDD):
            The RDD to save. Equals to self when invoking saveToCassandra on a monkey patched RDD.
        @param keyspace(string):
            The keyspace to save the RDD in. If not given and the rdd is a CassandraRDD the same
            keyspace is used.
        @param table(string):
            The CQL table to save the RDD in. If not given and the rdd is a CassandraRDD the same
            table is used.

        Keyword arguments:
        @param columns(iterable):
            The columns to save, i.e. which keys to take from the dicts in the RDD. If None given
            all columns are stored.
        @param write_conf(WriteConf):
            A WriteConf object to use when saving to Cassandra
        @param row_format(RowFormat):
            Make explicit how to map the RDD elements into Cassandra rows. If None given the
            mapping is auto-detected as far as possible.

        @raise ValueError: if neither the argument nor the RDD provides a keyspace / table.
    '''

    # BUG FIX: `rdd.keyspace` / `rdd.table` raised AttributeError for RDDs that
    # are not CassandraRDDs; use getattr so a plain RDD without an explicit
    # keyspace/table yields the documented ValueError instead.
    keyspace = keyspace or getattr(rdd, 'keyspace', None)
    if not keyspace:
        raise ValueError("keyspace not set")

    table = table or getattr(rdd, 'table', None)
    if not table:
        raise ValueError("table not set")

    # create write config as map and convert the columns to a string array
    write_conf = as_java_object(rdd.ctx._gateway, write_conf.__dict__) if write_conf else None
    columns = as_java_array(rdd.ctx._gateway, "String", columns) if columns else None

    # create a helper object
    helper = rdd.ctx._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
        .loadClass("pyspark_cassandra.PythonHelper").newInstance()

    # delegate to helper
    helper \
        .saveToCassandra(
            rdd._jrdd,
            keyspace,
            table,
            columns,
            write_conf,
            row_format
        )
def saveToCassandra(rdd, keyspace=None, table=None, columns=None, row_format=None, keyed=None,
                    write_conf=None, **write_conf_kwargs):
    '''
        Saves an RDD to Cassandra. The RDD is expected to contain dicts with keys mapping to CQL columns.

        Arguments:
        @param rdd(RDD):
            The RDD to save. Equals to self when invoking saveToCassandra on a monkey patched RDD.
        @param keyspace(string):
            The keyspace to save the RDD in. If not given and the rdd is a CassandraRDD the same
            keyspace is used.
        @param table(string):
            The CQL table to save the RDD in. If not given and the rdd is a CassandraRDD the same
            table is used.

        Keyword arguments:
        @param columns(iterable):
            The columns to save, i.e. which keys to take from the dicts in the RDD. If None given
            all columns are stored.
        @param row_format(RowFormat):
            Make explicit how to map the RDD elements into Cassandra rows. If None given the
            mapping is auto-detected as far as possible.
        @param keyed(bool):
            Make explicit that the RDD consists of key, value tuples (and not arrays of length
            two).
        @param write_conf(WriteConf):
            A WriteConf object to use when saving to Cassandra
        @param **write_conf_kwargs:
            WriteConf parameters to use when saving to Cassandra

        @raise ValueError: if neither the argument nor the RDD provides a keyspace / table.
    '''

    # fall back on the RDD's own keyspace / table (set on CassandraRDDs)
    keyspace = keyspace or getattr(rdd, 'keyspace', None)
    if not keyspace:
        raise ValueError("keyspace not set")

    table = table or getattr(rdd, 'table', None)
    if not table:
        raise ValueError("table not set")

    # merge explicit WriteConf with keyword overrides, then ship it to the JVM as a map
    conf = WriteConf.build(write_conf, **write_conf_kwargs)
    jconf = as_java_object(rdd.ctx._gateway, conf.settings())

    # the column selection crosses the gateway as a java String[] (or null for "all")
    jcolumns = as_java_array(rdd.ctx._gateway, "String", columns) if columns else None

    helper(rdd.ctx).saveToCassandra(
        rdd._jrdd,
        keyspace,
        table,
        jcolumns,
        row_format,
        keyed,
        jconf,
    )
def __init__(self, ctx, keyspace, table, row_format=None, read_conf=None, **read_conf_kwargs):
    """Build a full-table-scan RDD over a Cassandra table.

    Delegates validation and common setup to the parent constructor, then
    creates the JVM-side table RDD through the helper.

    @param ctx: The SparkContext, used for its JVM gateway.
    @param keyspace(string): The keyspace of the table to scan.
    @param table(string): The CQL table to scan.
    @param row_format(RowFormat): Optional explicit row mapping.
    @param read_conf(ReadConf): Optional read configuration.
    @param **read_conf_kwargs: ReadConf parameters, merged by the parent.
    """
    super(CassandraTableScanRDD, self).__init__(
        ctx, keyspace, table, row_format, read_conf, **read_conf_kwargs)

    # no key columns selected until key_by is used
    self._key_by = ColumnSelector.none()

    # self.read_conf was assembled by the parent; ship it to the JVM as a map
    jread_conf = as_java_object(ctx._gateway, self.read_conf.settings())
    self.crdd = self._helper.cassandraTable(ctx._jsc, keyspace, table, jread_conf)