Пример #1
0
	def select(self, *columns):
		"""Return a copy of this RDD with the select clause applied.

		@param *columns: The columns to select; each one is converted with
			str() before being passed on as a Java String array.
		"""
		names = as_java_array(self.ctx._gateway, "String", (str(col) for col in columns))

		selected = copy(self)
		# Narrow the underlying Java RDD first, then re-parse its rows for the copy.
		selected._cjrdd = selected._cjrdd.select(names)
		selected._jrdd = self._helper.parseRows(selected._cjrdd, self.row_format)
		return selected
Пример #2
0
    def asDataFrames(self, *index_by):
        """Read the spanned rows as DataFrames, optionally indexed.

        Reads the spanned rows as DataFrames if pandas is available, or as
        a dict of numpy arrays if only numpy is available or as a dict with
        primitives and objects otherwise.

        @param index_by: If pandas is available, the dataframes will be
            indexed by the given columns. None of them may be a column by
            which the rows are spanned (i.e. one of self.columns).
        @raise ValueError: If a column in index_by is also in self.columns.
        """
        for c in index_by:
            if c in self.columns:
                # Wrap c in a tuple so that a tuple-valued column name does
                # not get unpacked by the % operator.
                raise ValueError(
                    'column %s cannot be used as index in the data'
                    'frames as it is a column by which the rows are spanned.'
                    % (c,))

        columns = as_java_array(self.ctx._gateway, "String",
                                (str(c) for c in self.columns))
        jrdd = self._helper.spanBy(self._cjrdd, columns)
        rdd = RDD(jrdd, self.ctx)

        if index_by and pd:
            # set_index takes the keys as ONE argument; spreading them with *
            # would pass the second column as its `drop` parameter.
            index_columns = [str(c) for c in index_by]
            return rdd.mapValues(lambda df: df.set_index(index_columns))
        return rdd
Пример #3
0
	def select(self, *columns):
		"""Return a copy of this RDD with the select clause applied.

		@param *columns: The columns to select; each one is converted with
			str() before being passed on as a Java String array.
		"""
		selected = copy(self)
		selected._jrdd = selected._jrdd.select(
			as_java_array(self.ctx._gateway, "String", (str(col) for col in columns))
		)
		return selected
Пример #4
0
	def select(self, *columns):
		"""Build a copy of this RDD restricted by a select clause.

		@param *columns: The columns to select; str() is applied to each of
			them and the result is handed over as a Java String array.
		"""
		column_names = (str(name) for name in columns)
		java_columns = as_java_array(self.ctx._gateway, "String", column_names)

		clone = copy(self)
		clone._cjrdd = clone._cjrdd.select(java_columns)
		# The parsed view has to be rebuilt from the newly selected Java RDD.
		clone._jrdd = self._helper.parseRows(clone._cjrdd, self.row_format)
		return clone
Пример #5
0
 def where(self, clause, *args):
     """Return the result of specializing this RDD with a CQL where clause.

     @param clause: The where clause, either complete or with ? markers
     @param *args: The parameters for the ? markers in the where clause;
         they are passed on as a Java Object array.
     """
     java_args = as_java_array(self.ctx._gateway, "Object", args)
     return self._specialize('where', clause, java_args)
Пример #6
0
    def select(self, *columns):
        """Return a copy of this RDD with the select clause applied.

        @param *columns: The columns to select; each one is converted with
            str() before being passed on as a Java String array.
        """
        narrowed = copy(self)
        java_columns = as_java_array(
            self.ctx._gateway,
            "String",
            (str(name) for name in columns),
        )
        narrowed._jrdd = narrowed._jrdd.select(java_columns)
        return narrowed
Пример #7
0
def saveToCassandra(rdd,
                    keyspace=None,
                    table=None,
                    columns=None,
                    write_conf=None,
                    row_format=None):
    '''
        Saves an RDD to Cassandra. The RDD is expected to contain dicts with keys mapping to CQL columns.

        Arguments:
        @param rdd(RDD):
            The RDD to save. Equals to self when invoking saveToCassandra on a monkey patched RDD.
        @param keyspace(string):
            The keyspace to save the RDD in. If not given and the rdd is a CassandraRDD the same keyspace is used.
        @param table(string):
            The CQL table to save the RDD in. If not given and the rdd is a CassandraRDD the same table is used.

        Keyword arguments:
        @param columns(iterable):
            The columns to save, i.e. which keys to take from the dicts in the RDD.
            If None given all columns are stored.

        @param write_conf(WriteConf):
            A WriteConf object to use when saving to Cassandra

        @param row_format(RowFormat):
            Make explicit how to map the RDD elements into Cassandra rows.
            If None given the mapping is auto-detected as far as possible.

        @raise ValueError:
            If no keyspace or no table is given and the rdd does not provide one either.
    '''

    # An rdd without a keyspace / table attribute must end up in the
    # ValueError below instead of failing with an AttributeError.
    keyspace = keyspace or getattr(rdd, 'keyspace', None)
    if not keyspace:
        raise ValueError("keyspace not set")

    table = table or getattr(rdd, 'table', None)
    if not table:
        raise ValueError("table not set")

    # create write config as map and convert the columns to a string array
    write_conf = as_java_object(rdd.ctx._gateway,
                                write_conf.__dict__) if write_conf else None
    columns = as_java_array(rdd.ctx._gateway, "String",
                            columns) if columns else None

    # create a helper object through the context class loader of the JVM thread
    class_loader = rdd.ctx._jvm.java.lang.Thread.currentThread().getContextClassLoader()
    helper = class_loader.loadClass("pyspark_cassandra.PythonHelper").newInstance()

    # delegate to helper
    helper.saveToCassandra(
        rdd._jrdd,
        keyspace,
        table,
        columns,
        write_conf,
        row_format,
    )
Пример #8
0
    def where(self, clause, *args):
        """Return a copy of this RDD with a CQL where clause applied.

        @param clause: The where clause, either complete or with ? markers
        @param *args: The parameters for the ? markers in the where clause;
            they are passed on as a Java Object array.
        """
        filtered = copy(self)
        java_args = as_java_array(self.ctx._gateway, "Object", args)
        filtered._jrdd = filtered._jrdd.where(clause, java_args)
        return filtered
Пример #9
0
	def where(self, clause, *args):
		"""Build a copy of this RDD restricted by a CQL where clause.

		@param clause: The where clause, either complete or with ? markers
		@param *args: The parameters for the ? markers in the where clause;
			they are handed over as a Java Object array.
		"""
		parameters = as_java_array(self.ctx._gateway, "Object", args)

		clone = copy(self)
		clone._jrdd = clone._jrdd.where(clause, parameters)
		return clone
Пример #10
0
def saveToCassandra(rdd, keyspace=None, table=None, columns=None, row_format=None, keyed=None, write_conf=None,
                    **write_conf_kwargs):
    """
        Saves an RDD to Cassandra. The RDD is expected to contain dicts with keys mapping to CQL columns.

        Arguments:
        @param rdd(RDD):
            The RDD to save. Equals to self when invoking saveToCassandra on a monkey patched RDD.
        @param keyspace(string):
            The keyspace to save the RDD in. If not given, the keyspace attribute of the rdd (if any) is used.
        @param table(string):
            The CQL table to save the RDD in. If not given, the table attribute of the rdd (if any) is used.

        Keyword arguments:
        @param columns(iterable):
            The columns to save, i.e. which keys to take from the dicts in the RDD.
            If None given all columns are stored.

        @param row_format(RowFormat):
            Make explicit how to map the RDD elements into Cassandra rows.
            If None given the mapping is auto-detected as far as possible.
        @param keyed(bool):
            Make explicit that the RDD consists of key, value tuples (and not arrays of length two).

        @param write_conf(WriteConf):
            A WriteConf object to use when saving to Cassandra
        @param **write_conf_kwargs:
            WriteConf parameters to use when saving to Cassandra

        @raise ValueError:
            If neither the arguments nor the rdd provide a keyspace or a table.
    """

    # Resolve the target, falling back on what the rdd itself carries.
    target_keyspace = keyspace or getattr(rdd, 'keyspace', None)
    if not target_keyspace:
        raise ValueError("keyspace not set")

    target_table = table or getattr(rdd, 'table', None)
    if not target_table:
        raise ValueError("table not set")

    # Merge the WriteConf object and keyword settings, then ship them as a Java object.
    merged_conf = WriteConf.build(write_conf, **write_conf_kwargs)
    gateway = rdd.ctx._gateway
    java_conf = as_java_object(gateway, merged_conf.settings())

    # The columns travel as a Java String array, or None when not given.
    if columns:
        java_columns = as_java_array(gateway, "String", columns)
    else:
        java_columns = None

    jvm_helper = helper(rdd.ctx)
    jvm_helper.saveToCassandra(
        rdd._jrdd,
        target_keyspace,
        target_table,
        java_columns,
        row_format,
        keyed,
        java_conf,
    )
Пример #11
0
def saveToCassandra(rdd, keyspace=None, table=None, columns=None, write_conf=None, row_format=None):
	'''
		Saves an RDD to Cassandra. The RDD is expected to contain dicts with keys mapping to CQL columns.

		Arguments:
		@param rdd(RDD):
			The RDD to save. Equals to self when invoking saveToCassandra on a monkey patched RDD.
		@param keyspace(string):
			The keyspace to save the RDD in. If not given and the rdd is a CassandraRDD the same keyspace is used.
		@param table(string):
			The CQL table to save the RDD in. If not given and the rdd is a CassandraRDD the same table is used.

		Keyword arguments:
		@param columns(iterable):
			The columns to save, i.e. which keys to take from the dicts in the RDD.
			If None given all columns are stored.

		@param write_conf(WriteConf):
			A WriteConf object to use when saving to Cassandra

		@param row_format(RowFormat):
			Make explicit how to map the RDD elements into Cassandra rows.
			If None given the mapping is auto-detected as far as possible.

		@raise ValueError:
			If no keyspace or no table is given and the rdd does not provide one either.
	'''

	# An rdd without a keyspace / table attribute must end up in the
	# ValueError below instead of failing with an AttributeError.
	keyspace = keyspace or getattr(rdd, 'keyspace', None)
	if not keyspace:
		raise ValueError("keyspace not set")

	table = table or getattr(rdd, 'table', None)
	if not table:
		raise ValueError("table not set")

	# create write config as map and convert the columns to a string array
	write_conf = as_java_object(rdd.ctx._gateway, write_conf.__dict__) if write_conf else None
	columns = as_java_array(rdd.ctx._gateway, "String", columns) if columns else None

	# create a helper object through the context class loader of the JVM thread
	class_loader = rdd.ctx._jvm.java.lang.Thread.currentThread().getContextClassLoader()
	helper = class_loader.loadClass("pyspark_cassandra.PythonHelper").newInstance()

	# delegate to helper
	helper.saveToCassandra(
		rdd._jrdd,
		keyspace,
		table,
		columns,
		write_conf,
		row_format,
	)
Пример #12
0
	def asDataFrames(self, *index_by):
		"""Read the spanned rows as DataFrames, optionally indexed.

		Reads the spanned rows as DataFrames if pandas is available, or as
		a dict of numpy arrays if only numpy is available or as a dict with
		primitives and objects otherwise.

		@param index_by: If pandas is available, the dataframes will be
			indexed by the given columns. None of them may be a column by
			which the rows are spanned (i.e. one of self.columns).
		@raise ValueError: If a column in index_by is also in self.columns.
		"""
		for c in index_by:
			if c in self.columns:
				# Wrap c in a tuple so that a tuple-valued column name does
				# not get unpacked by the % operator.
				raise ValueError('column %s cannot be used as index in the data'
					'frames as it is a column by which the rows are spanned.' % (c,))

		columns = as_java_array(self.ctx._gateway, "String", (str(c) for c in self.columns))
		jrdd = self._helper.spanBy(self._cjrdd, columns)
		rdd = RDD(jrdd, self.ctx)

		if index_by and pd:
			# set_index takes the keys as ONE argument; spreading them with *
			# would pass the second column as its `drop` parameter.
			index_columns = [str(c) for c in index_by]
			return rdd.mapValues(lambda df: df.set_index(index_columns))
		return rdd
Пример #13
0
 def on(self, *columns):
     """Return the result of specializing this RDD with 'on' for the given columns.

     @param *columns: The columns to pass on; each one is converted with
         str() and the result is handed over as a Java String array.
     """
     java_columns = as_java_array(
         self.ctx._gateway,
         "String",
         (str(name) for name in columns),
     )
     return self._specialize('on', java_columns)
Пример #14
0
 def select(self, *columns):
     """Return the result of specializing this RDD with a select clause.

     @param *columns: The columns to select; each one is converted with
         str() and the result is handed over as a Java String array.
     """
     names = (str(name) for name in columns)
     java_columns = as_java_array(self.ctx._gateway, "String", names)
     return self._specialize('select', java_columns)