Пример #1
0
def joinWithCassandraTable(dstream, keyspace, table, selected_columns=None, join_columns=None):
    """Join every RDD of a DStream with a Cassandra table.

    Arguments:
        @param dstream(DStream):
            The stream to join. Equals self when this function is invoked as
            a method on a monkey patched DStream.
        @param keyspace(string):
            The keyspace of the table to join with.
        @param table(string):
            The CQL table to join with.
        @param selected_columns(iterable):
            Names of the columns to select from the Cassandra table. Converted
            to a Java String array; None or empty is passed on as None.
        @param join_columns(iterable):
            Names of the Cassandra columns to join on. Converted to a Java
            String array; None or empty is passed on as None.

    Returns a new DStream wrapping the joined Java stream; its rows are
    pickled by the helper and read back with an auto-batched pickle
    serializer.
    """

    streaming_context = dstream._ssc
    spark_context = streaming_context._sc
    gateway = spark_context._gateway

    # Falsy column selections (None, empty) travel to the JVM as None.
    if selected_columns:
        selected_columns = as_java_array(gateway, "String", selected_columns)
    else:
        selected_columns = None

    if join_columns:
        join_columns = as_java_array(gateway, "String", join_columns)
    else:
        join_columns = None

    jvm_helper = helper(spark_context)
    joined = jvm_helper.joinWithCassandraTable(
        dstream._jdstream,
        keyspace,
        table,
        selected_columns,
        join_columns,
    )
    pickled = jvm_helper.pickleRows(joined)
    java_dstream = jvm_helper.javaDStream(pickled)

    serializer = AutoBatchedSerializer(PickleSerializer())
    return DStream(java_dstream, streaming_context, serializer)
Пример #2
0
def joinWithCassandraTable(dstream, keyspace, table, selected_columns=None,
                           join_columns=None):
    """Join a DStream (a stream of RDDs) with a Cassandra table.

    Arguments:
        @param dstream(DStream):
            The DStream to join. Equals self when invoking
            joinWithCassandraTable on a monkey patched DStream.
        @param keyspace(string):
            The keyspace containing the table to join with.
        @param table(string):
            The CQL table to join with.
        @param selected_columns(iterable):
            The column names to select from the Cassandra table; handed to
            the JVM as a String array, or as None when empty / not given.
        @param join_columns(iterable):
            The Cassandra column names used to join on; handed to the JVM as
            a String array, or as None when empty / not given.

    Returns a DStream built from the helper's joined and pickled Java stream.
    """

    ssc = dstream._ssc
    ctx = ssc._sc
    gw = ctx._gateway

    def _string_array(names):
        # A falsy selection (None or empty) is forwarded to the JVM as None.
        if not names:
            return None
        return as_java_array(gw, "String", names)

    selected_columns = _string_array(selected_columns)
    join_columns = _string_array(join_columns)

    h = helper(ctx)
    jstream = h.javaDStream(
        h.pickleRows(
            h.joinWithCassandraTable(
                dstream._jdstream, keyspace, table,
                selected_columns, join_columns)))

    return DStream(jstream, ssc, AutoBatchedSerializer(PickleSerializer()))
Пример #3
0
def deleteFromCassandra(dstream, keyspace=None, table=None,
                        deleteColumns=None, keyColumns=None,
                        row_format=None, keyed=None, write_conf=None,
                        **write_conf_kwargs):
    """Delete data from a Cassandra table using the elements of a DStream.

    Arguments:
        @param dstream(DStream):
            The stream whose elements drive the delete. Equals self when
            this function is invoked as a method on a monkey patched DStream.
        @param keyspace(string):
            The keyspace of the table to delete from. Forwarded as given;
            no default is derived in this function.
        @param table(string):
            The CQL table to delete from. Forwarded as given.

        Keyword arguments:
        @param deleteColumns(iterable):
            Names of the columns to delete. Converted to a Java String array;
            None or empty is forwarded as None (presumably meaning the full
            row is deleted - confirm against the JVM helper).
        @param keyColumns(iterable):
            Names of the key columns. Converted to a Java String array; None
            or empty is forwarded as None (presumably meaning all primary key
            columns are used - confirm against the JVM helper).
        @param row_format(RowFormat):
            Forwarded unchanged to the JVM helper.
        @param keyed(bool):
            Make explicit that the elements are key, value tuples (and not
            arrays of length two). Forwarded unchanged.
        @param write_conf(WriteConf):
            A WriteConf object to use for the operation.
        @param **write_conf_kwargs:
            WriteConf parameters, combined with write_conf via
            WriteConf.build.

    Returns whatever the JVM helper's deleteFromCassandra returns.
    """

    spark_context = dstream._ssc._sc
    gateway = spark_context._gateway

    # The write configuration crosses into the JVM as a map of its settings.
    conf = WriteConf.build(write_conf, **write_conf_kwargs)
    java_conf = as_java_object(gateway, conf.settings())

    # Column selections become Java String arrays; falsy ones become None.
    if deleteColumns:
        deleteColumns = as_java_array(gateway, "String", deleteColumns)
    else:
        deleteColumns = None

    if keyColumns:
        keyColumns = as_java_array(gateway, "String", keyColumns)
    else:
        keyColumns = None

    jvm_helper = helper(spark_context)
    return jvm_helper.deleteFromCassandra(
        dstream._jdstream,
        keyspace,
        table,
        deleteColumns,
        keyColumns,
        row_format,
        keyed,
        java_conf,
    )
Пример #4
0
    def asDataFrames(self, *index_by):
        '''
            Reads the spanned rows as an RDD whose values are, according to
            the original documentation, DataFrames if pandas is available, a
            dict of numpy arrays if only numpy is available, or a dict with
            primitives and objects otherwise.

            @param index_by If pandas is available, the dataframes will be
            indexed by the given columns. Several columns yield a multi-level
            index.

            Raises ValueError when a requested index column is one of the
            columns by which the rows are spanned.
        '''
        for c in index_by:
            if c in self.columns:
                # Interpolate the offending column; wrap it in a tuple so a
                # tuple-valued column name cannot break the % formatting.
                raise ValueError(
                    'column %s cannot be used as index in the data'
                    'frames as it is a column by which the rows are spanned.'
                    % (c,))

        columns = as_java_array(self.ctx._gateway, "String",
                                (str(c) for c in self.columns))
        jrdd = self._helper.spanBy(self._crdd, columns)
        rdd = RDD(jrdd, self.ctx)

        if index_by and pd:
            # Pass all index columns as ONE list argument: spreading them
            # positionally would feed the second column into set_index's
            # `drop` parameter instead of using it as an index level.
            index_columns = [str(c) for c in index_by]
            return rdd.mapValues(lambda df: df.set_index(index_columns))

        return rdd
Пример #5
0
 def where(self, clause, *args):
     """Create a CassandraRDD with a CQL where clause applied.

     @param clause: The where clause, either complete or containing ?
         markers.
     @param *args: The values for the ? markers in the where clause; they
         are handed on as a Java Object array.
     """
     gateway = self.ctx._gateway
     java_args = as_java_array(gateway, "Object", args)
     return self._specialize('where', clause, java_args)
Пример #6
0
 def where(self, clause, *args):
     """Apply a CQL where clause, yielding a specialized CassandraRDD.

     @param clause: The where clause; may be complete or use ? markers.
     @param *args: Parameters bound to the ? markers of the clause,
         converted to a Java Object array before being passed on.
     """
     marker_values = as_java_array(self.ctx._gateway, "Object", args)
     specialization = (clause, marker_values)
     return self._specialize('where', *specialization)
Пример #7
0
def saveToCassandra(rdd, keyspace=None, table=None, columns=None, row_format=None, keyed=None,
                    write_conf=None, **write_conf_kwargs):
    """
        Saves an RDD to Cassandra. The RDD is expected to contain dicts with keys mapping to
        CQL columns.

        Arguments:
        @param rdd(RDD):
            The RDD to save. Equals self when invoking saveToCassandra on a monkey patched
            RDD.
        @param keyspace(string):
            The keyspace to save the RDD in. If not given, the rdd's own `keyspace`
            attribute is used when it has one (e.g. a CassandraRDD).
        @param table(string):
            The CQL table to save the RDD in. If not given, the rdd's own `table` attribute
            is used when it has one (e.g. a CassandraRDD).

        Keyword arguments:
        @param columns(iterable):
            The columns to save, i.e. which keys to take from the dicts in the RDD.
            If None (or empty) is given, None is forwarded and all columns are stored.
        @param row_format(RowFormat):
            Make explicit how to map the RDD elements into Cassandra rows.
            If None is given the mapping is auto-detected as far as possible.
        @param keyed(bool):
            Make explicit that the RDD consists of key, value tuples (and not arrays of
            length two).
        @param write_conf(WriteConf):
            A WriteConf object to use when saving to Cassandra.
        @param **write_conf_kwargs:
            WriteConf parameters to use when saving to Cassandra.

        Raises ValueError when no keyspace or no table can be determined.
        Returns None.
    """

    # Fall back to the keyspace / table carried by the RDD itself, if any.
    if not keyspace:
        keyspace = getattr(rdd, 'keyspace', None)
    if not keyspace:
        raise ValueError("keyspace not set")

    if not table:
        table = getattr(rdd, 'table', None)
    if not table:
        raise ValueError("table not set")

    # The write configuration crosses into the JVM as a map of its settings.
    conf = WriteConf.build(write_conf, **write_conf_kwargs)
    gateway = rdd.ctx._gateway
    java_conf = as_java_object(gateway, conf.settings())

    # Column names become a Java String array; a falsy selection becomes None.
    java_columns = None
    if columns:
        java_columns = as_java_array(gateway, "String", columns)

    jvm_helper = helper(rdd.ctx)
    jvm_helper.saveToCassandra(
        rdd._jrdd, keyspace, table, java_columns, row_format, keyed, java_conf)
Пример #8
0
def saveToCassandra(dstream, keyspace, table, columns=None, row_format=None, keyed=None,
                    write_conf=None, **write_conf_kwargs):
    """Save the contents of a DStream to a Cassandra table.

    Arguments:
        @param dstream(DStream):
            The stream to save.
        @param keyspace(string):
            The keyspace to save to.
        @param table(string):
            The CQL table to save to.
        @param columns(iterable):
            Names of the columns to save. Converted to a Java String array;
            None or empty is forwarded as None.
        @param row_format:
            Forwarded unchanged to the JVM helper.
        @param keyed:
            Forwarded unchanged to the JVM helper.
        @param write_conf(WriteConf):
            A WriteConf object to use when saving.
        @param **write_conf_kwargs:
            WriteConf parameters, combined with write_conf via
            WriteConf.build.

    Returns whatever the JVM helper's saveToCassandra returns.
    """
    spark_context = dstream._ssc._sc
    gateway = spark_context._gateway

    # The write configuration crosses into the JVM as a map of its settings.
    conf = WriteConf.build(write_conf, **write_conf_kwargs)
    java_conf = as_java_object(gateway, conf.settings())

    # Column names become a Java String array; a falsy selection becomes None.
    if columns:
        java_columns = as_java_array(gateway, "String", columns)
    else:
        java_columns = None

    jvm_helper = helper(spark_context)
    return jvm_helper.saveToCassandra(
        dstream._jdstream,
        keyspace,
        table,
        java_columns,
        row_format,
        keyed,
        java_conf,
    )
Пример #9
0
def saveToCassandra(dstream, keyspace, table, columns=None, row_format=None,
                    keyed=None, write_conf=None, **write_conf_kwargs):
    """Write a DStream to the given Cassandra keyspace and table.

    @param dstream(DStream): The stream to save.
    @param keyspace(string): The keyspace to save to.
    @param table(string): The CQL table to save to.
    @param columns(iterable): Column names to save; sent to the JVM as a
        String array, or as None when empty / not given.
    @param row_format: Passed through to the JVM helper as is.
    @param keyed: Passed through to the JVM helper as is.
    @param write_conf(WriteConf): A WriteConf object to use when saving.
    @param **write_conf_kwargs: WriteConf parameters merged with write_conf
        by WriteConf.build.

    Returns the result of the JVM helper's saveToCassandra call.
    """
    ctx = dstream._ssc._sc
    gw = ctx._gateway

    # Build the effective write configuration and ship its settings as a map.
    settings = WriteConf.build(write_conf, **write_conf_kwargs).settings()
    java_write_conf = as_java_object(gw, settings)

    # Convert the column names to a string array unless nothing was selected.
    java_columns = as_java_array(gw, "String", columns) if columns else None

    call_args = (dstream._jdstream, keyspace, table, java_columns,
                 row_format, keyed, java_write_conf)
    return helper(ctx).saveToCassandra(*call_args)
Пример #10
0
    def asDataFrames(self, *index_by):
        '''
            Reads the spanned rows as an RDD whose values are, according to
            the original documentation, DataFrames if pandas is available, a
            dict of numpy arrays if only numpy is available, or a dict with
            primitives and objects otherwise.

            @param index_by If pandas is available, the dataframes will be
            indexed by the given columns. Several columns yield a multi-level
            index.

            Raises ValueError when a requested index column is one of the
            columns by which the rows are spanned.
        '''
        for c in index_by:
            if c in self.columns:
                # Report which column was rejected; the tuple wrapper keeps
                # % formatting safe for tuple-valued column names.
                raise ValueError(
                    'column %s cannot be used as index in the data'
                    'frames as it is a column by which the rows are spanned.'
                    % (c,))

        columns = as_java_array(self.ctx._gateway, "String", (str(c) for c in self.columns))
        jrdd = self._helper.spanBy(self._crdd, columns)
        rdd = RDD(jrdd, self.ctx)

        if index_by and pd:
            # set_index takes the index columns as a single `keys` argument;
            # unpacking them positionally would push the second column into
            # the `drop` parameter.
            index_columns = [str(c) for c in index_by]
            return rdd.mapValues(lambda df: df.set_index(index_columns))

        return rdd
Пример #11
0
 def on(self, *columns):
     """Specialize this RDD with 'on' for the given columns.

     Each column is converted with str() and the resulting names are passed
     to _specialize as a Java String array.
     """
     gateway = self.ctx._gateway
     names = (str(column) for column in columns)
     java_names = as_java_array(gateway, "String", names)
     return self._specialize('on', java_names)
Пример #12
0
 def select(self, *columns):
     """Create a CassandraRDD with the select clause applied.

     Each column is converted with str() and the resulting names are passed
     to _specialize as a Java String array.
     """
     gateway = self.ctx._gateway
     names = (str(column) for column in columns)
     java_names = as_java_array(gateway, "String", names)
     return self._specialize('select', java_names)
Пример #13
0
def deleteFromCassandra(rdd, keyspace=None, table=None, deleteColumns=None,
                        keyColumns=None, row_format=None, keyed=None,
                        write_conf=None, **write_conf_kwargs):
    """
        Delete data from a Cassandra table, using data from the RDD as
        primary keys. Uses the specified column names.

        Arguments:
        @param rdd(RDD):
            The RDD whose elements drive the delete. Equals self when
            invoking deleteFromCassandra on a monkey patched RDD.
        @param keyspace(string):
            The keyspace of the table to delete from. If not given, the
            rdd's own `keyspace` attribute is used when it has one (e.g. a
            CassandraRDD).
        @param table(string):
            The CQL table to delete from. If not given, the rdd's own
            `table` attribute is used when it has one (e.g. a CassandraRDD).

        Keyword arguments:
        @param deleteColumns(iterable):
            Names of the columns to delete. Converted to a Java String
            array; None or empty is forwarded as None (presumably meaning
            the full row is deleted - confirm against the JVM helper).
        @param keyColumns(iterable):
            Names of the key columns. Converted to a Java String array; None
            or empty is forwarded as None (presumably meaning all primary
            key columns are used - confirm against the JVM helper).
        @param row_format(RowFormat):
            Forwarded unchanged to the JVM helper.
        @param keyed(bool):
            Make explicit that the RDD consists of key, value tuples (and
            not arrays of length two).
        @param write_conf(WriteConf):
            A WriteConf object to use for the operation.
        @param **write_conf_kwargs:
            WriteConf parameters, combined with write_conf via
            WriteConf.build.

        Raises ValueError when no keyspace or no table can be determined.
        Returns None.
    """

    # Fall back to the keyspace / table carried by the RDD itself, if any.
    if not keyspace:
        keyspace = getattr(rdd, 'keyspace', None)
    if not keyspace:
        raise ValueError("keyspace not set")

    if not table:
        table = getattr(rdd, 'table', None)
    if not table:
        raise ValueError("table not set")

    # The write configuration crosses into the JVM as a map of its settings.
    conf = WriteConf.build(write_conf, **write_conf_kwargs)
    gateway = rdd.ctx._gateway
    java_conf = as_java_object(gateway, conf.settings())

    # Column selections become Java String arrays; falsy ones become None.
    java_delete_columns = None
    if deleteColumns:
        java_delete_columns = as_java_array(gateway, "String", deleteColumns)

    java_key_columns = None
    if keyColumns:
        java_key_columns = as_java_array(gateway, "String", keyColumns)

    jvm_helper = helper(rdd.ctx)
    jvm_helper.deleteFromCassandra(
        rdd._jrdd, keyspace, table,
        java_delete_columns, java_key_columns,
        row_format, keyed, java_conf)
Пример #14
0
 def on(self, *columns):
     """Apply 'on' with the given columns via _specialize.

     The columns are stringified and sent along as a Java String array.
     """
     gw = self.ctx._gateway
     column_names = (str(col) for col in columns)
     return self._specialize('on', as_java_array(gw, "String", column_names))
Пример #15
0
 def select(self, *columns):
     """Creates a CassandraRDD with the select clause applied.

     The columns are stringified and sent along as a Java String array.
     """
     gw = self.ctx._gateway
     column_names = (str(col) for col in columns)
     return self._specialize('select', as_java_array(gw, "String", column_names))