Example #1
    def recommend(self,
                  entity_id,
                  number_of_recommendations=1,
                  recommend_products=True):
        """
        Recommend products to users, or vice versa.

        :param entity_id: (int) A user/product id
        :param number_of_recommendations: (int) Number of recommendations
        :param recommend_products: (bool) True to recommend products for the user; False to recommend users for the product
        :return: (list) A list of recommendations (each entry converted from the underlying Scala map)
        """
        require_type(int, entity_id, "entity_id")
        require_type.non_negative_int(number_of_recommendations,
                                      "number_of_recommendations")
        require_type(bool, recommend_products, "recommend_products")

        # returns scala list of scala map
        scala_list_of_scala_map = self._scala.recommend(
            entity_id, number_of_recommendations, recommend_products)

        # First convert to python list of scala map
        python_list_of_scala_map = self._tc.jutils.convert.from_scala_seq(
            scala_list_of_scala_map)

        # Convert to Python list of python map
        python_list_of_python_map = []
        for scala_map in python_list_of_scala_map:
            python_list_of_python_map.append(
                self._tc.jutils.convert.scala_map_to_python(scala_map))

        return python_list_of_python_map
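
A hedged usage sketch: assuming `model` is a trained sparktk model exposing this recommend() method and a live TkContext behind it, the require_type guards reject bad arguments on the Python side before any JVM call.

# Hypothetical usage; `model` and the id values are illustrative only.
recommendations = model.recommend(entity_id=1,
                                  number_of_recommendations=3,
                                  recommend_products=True)
print(recommendations)            # list of Python dicts converted from Scala maps

# Guard behaviour (see the require_type tests further down this page):
# model.recommend(entity_id="1")  # would raise TypeError  (entity_id must be int)
# model.recommend(1, -2)          # would raise ValueError (negative count rejected)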
Example #2
def import_dcm(dicom_dir_path, min_partitions=2, tc=TkContext.implicit):
    """
    Creates a Dicom object with metadata and pixeldata frames from one or more .dcm files

    Parameters
    ----------

    :param dicom_dir_path: (str) Local/HDFS path of the dcm file(s)
    :param min_partitions: (int) Minimum number of HDFS partitions to use when importing the .dcm file(s)
    :return: (Dicom) returns a dicom object with metadata and pixeldata frames


    Examples
    --------
        #Path can be local/hdfs to dcm file(s)
        >>> dicom_path = "../datasets/dicom_uncompressed"

        #use import_dcm available inside dicom module to create a dicom object from given dicom_path
        >>> dicom = tc.dicom.import_dcm(dicom_path)

        #Type of dicom object created
        >>> type(dicom)
        <class 'sparktk.dicom.dicom.Dicom'>

        #Inspect metadata property to see dicom metadata xml content
        <skip>
        >>> dicom.metadata.inspect(truncate=30)
        [#]  id  metadata
        =======================================
        [0]   0  <?xml version="1.0" encodin...
        [1]   1  <?xml version="1.0" encodin...
        [2]   2  <?xml version="1.0" encodin...
        </skip>

        #pixeldata property is sparktk frame
        >>> pixeldata = dicom.pixeldata.take(1)

        <skip>

        >>> pixeldata
        [[0L, array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  7.,  5., ...,  5.,  7.,  8.],
        [ 0.,  7.,  6., ...,  5.,  6.,  7.],
        ...,
        [ 0.,  6.,  7., ...,  5.,  5.,  6.],
        [ 0.,  2.,  5., ...,  5.,  5.,  4.],
        [ 1.,  1.,  3., ...,  1.,  1.,  0.]])]]
        </skip>

    """

    require_type.non_empty_str(dicom_dir_path, "dicom_dir_path")
    require_type.non_negative_int(min_partitions, "min_partitions")

    TkContext.validate(tc)

    scala_dicom = tc.sc._jvm.org.trustedanalytics.sparktk.dicom.internal.constructors.Import.importDcm(
        tc.jutils.get_scala_sc(), dicom_dir_path, min_partitions)
    from sparktk.dicom.dicom import Dicom
    return Dicom._from_scala(tc, scala_dicom)
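
As in the other snippets on this page, min_partitions is validated on the Python side before the JVM call. A hedged sketch, assuming a live TkContext `tc` (the path is a placeholder):

# Illustrative only: a negative partition count fails fast in Python,
# before Import.importDcm is ever reached on the Scala side.
try:
    tc.dicom.import_dcm("../datasets/dicom_uncompressed", min_partitions=-1)
except ValueError as err:
    print(err)   # message contains "Expected non-negative integer"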
Example #3
    def recommend(self, entity_id, number_of_recommendations=1, recommend_products=True):
        """
        Recommend products to users, or vice versa.

        :param entity_id: (int) A user/product id
        :param number_of_recommendations: (int) Number of recommendations
        :param recommend_products: (bool) True to recommend products for the user; False to recommend users for the product
        :return: (list) A list of recommendations (each entry converted from the underlying Scala map)
        """
        require_type(int, entity_id, "entity_id")
        require_type.non_negative_int(number_of_recommendations, "number_of_recommendations")
        require_type(bool, recommend_products, "recommend_products")

        # returns scala list of scala map
        scala_list_of_scala_map = self._scala.recommend(entity_id, number_of_recommendations, recommend_products)

        # First convert to python list of scala map
        python_list_of_scala_map = self._tc.jutils.convert.from_scala_seq(scala_list_of_scala_map)

        # Convert to Python list of python map
        python_list_of_python_map = []
        for scala_map in python_list_of_scala_map:
            python_list_of_python_map.append(self._tc.jutils.convert.scala_map_to_python(scala_map))

        return python_list_of_python_map
Example #4
def import_dcm(dicom_dir_path, min_partitions=2, tc=TkContext.implicit):
    """
    Creates a Dicom object with metadata and pixeldata frames from one or more .dcm files

    Parameters
    ----------

    :param dicom_dir_path: (str) Local/HDFS path of the dcm file(s)
    :param min_partitions: (int) Minimum number of HDFS partitions to use when importing the .dcm file(s)
    :return: (Dicom) returns a dicom object with metadata and pixeldata frames


    Examples
    --------
        #Path can be local/hdfs to dcm file(s)
        >>> dicom_path = "../datasets/dicom_uncompressed"

        #use import_dcm available inside dicom module to create a dicom object from given dicom_path
        >>> dicom = tc.dicom.import_dcm(dicom_path)

        #Type of dicom object created
        >>> type(dicom)
        <class 'sparktk.dicom.dicom.Dicom'>

        #Inspect metadata property to see dicom metadata xml content
        <skip>
        >>> dicom.metadata.inspect(truncate=30)
        [#]  id  metadata
        =======================================
        [0]   0  <?xml version="1.0" encodin...
        [1]   1  <?xml version="1.0" encodin...
        [2]   2  <?xml version="1.0" encodin...
        </skip>

        #pixeldata property is sparktk frame
        >>> pixeldata = dicom.pixeldata.take(1)

        <skip>

        >>> pixeldata
        [[0L, array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  7.,  5., ...,  5.,  7.,  8.],
        [ 0.,  7.,  6., ...,  5.,  6.,  7.],
        ...,
        [ 0.,  6.,  7., ...,  5.,  5.,  6.],
        [ 0.,  2.,  5., ...,  5.,  5.,  4.],
        [ 1.,  1.,  3., ...,  1.,  1.,  0.]])]]
        </skip>

    """

    require_type.non_empty_str(dicom_dir_path, "dicom_dir_path")
    require_type.non_negative_int(min_partitions, "min_partitions")

    TkContext.validate(tc)

    scala_dicom = tc.sc._jvm.org.trustedanalytics.sparktk.dicom.internal.constructors.Import.importDcm(tc.jutils.get_scala_sc(), dicom_dir_path, min_partitions)
    from sparktk.dicom.dicom import Dicom
    return Dicom._from_scala(tc, scala_dicom)
Example #5
 def test_non_negative_int_type_error(self):
     try:
         require_type.non_negative_int("12", "a")
     except TypeError as e:
         msg = str(e)
         expected = "Expected type <type 'int'>"
         self.assertTrue(expected in msg, "\nexpected=%s\nmessage =%s" % (expected, msg))
     else:
         self.fail("A TypeError should have been raised")
Example #6
 def test_non_negative_int_value_error(self):
     try:
         require_type.non_negative_int(-1, "a")
     except ValueError as e:
         msg = str(e)
         expected = "Expected non-negative integer"
         self.assertTrue(expected in msg, "\nexpected=%s\nmessage =%s" % (expected, msg))
     else:
         self.fail("A ValueError should have been raised")
Example #7
 def test_non_negative_int_type_error(self):
     try:
         require_type.non_negative_int("12", "a")
     except TypeError as e:
         msg = str(e)
         expected = "Expected type <type 'int'>"
         self.assertTrue(expected in msg,
                         "\nexpected=%s\nmessage =%s" % (expected, msg))
     else:
         self.fail("A TypeError should have been raised")
Example #8
 def test_non_negative_int_value_error(self):
     try:
         require_type.non_negative_int(-1, "a")
     except ValueError as e:
         msg = str(e)
         expected = "Expected non-negative integer"
         self.assertTrue(expected in msg,
                         "\nexpected=%s\nmessage =%s" % (expected, msg))
     else:
         self.fail("A ValueError should have been raised")
Example #9
def train(frame,
          time_column,
          covariate_columns,
          censor_column,
          convergence_tolerance=1E-6,
          max_steps=100):
    """
    Creates a CoxProportionalHazardsModel by training on the given frame

    Parameters
    ----------

    :param frame: (Frame) A frame to train the model on
    :param time_column: (str) Column name containing the time of occurrence of each observation.
    :param covariate_columns: (Seq[str]) List of column(s) containing the covariates.
    :param censor_column: (str) Column name containing censor value of each observation.
    :param convergence_tolerance: (float) Parameter for the convergence tolerance for iterative algorithms. Default is 1E-6
    :param max_steps: (int) Parameter for maximum number of steps. Default is 100
    :return: (CoxProportionalHazardsModel) A trained coxPh model
    """
    from sparktk.frame.frame import Frame
    require_type(Frame, frame, "frame cannot be None")
    require_type.non_empty_str(time_column, "time_column")
    require_type.non_empty_str(censor_column, "censor_column")
    require_type(float, convergence_tolerance, "convergence_tolerance should be float")
    require_type.non_negative_int(max_steps, "max_steps")
    affirm_type.list_of_str(covariate_columns, "covariate_columns")

    tc = frame._tc
    _scala_obj = get_scala_obj(tc)
    scala_covariate_columns = tc.jutils.convert.to_scala_vector_string(covariate_columns)

    scala_model = _scala_obj.train(frame._scala,
                                   time_column,
                                   scala_covariate_columns,
                                   censor_column,
                                   convergence_tolerance,
                                   max_steps)
    return CoxProportionalHazardsModel(tc, scala_model)
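
A hedged end-to-end sketch, assuming a live TkContext `tc` and that this module's train() is importable; the column names and toy rows below are invented for illustration:

# Toy survival data: observation time, one covariate, and a 0/1 censor flag.
frame = tc.frame.create([[18.0, 42.0, 1.0], [20.0, 40.0, 1.0], [6.0, 70.0, 0.0]],
                        [("time", float), ("age", float), ("censor", float)])
model = train(frame,
              time_column="time",
              covariate_columns=["age"],
              censor_column="censor",
              convergence_tolerance=1e-6,
              max_steps=100)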
Example #10
def take(self, n, offset=0, columns=None):
    """
    Get data subset.

    Take a subset of the currently active Frame.

    (See 'collect' operation to simply get all the data from the Frame)

    Parameters
    ----------

    :param n: (int) The number of rows to get from the frame (warning: do not overwhelm the python session
                    by taking too much)
    :param offset: (Optional[int]) The number of rows to skip before starting to copy.
    :param columns: (Optional[str or list[str]]) If not None, only the given columns' data will be provided.
                    By default, all columns are included.
    :return: (list[list[data]]) raw frame data

    Examples
    --------

    <hide>
        >>> schema = [('name',str), ('age', int), ('tenure', int), ('phone', str)]
        >>> rows = [['Fred', 39, 16, '555-1234'], ['Susan', 33, 3, '555-0202'], ['Thurston', 65, 26, '555-4510'], ['Judy', 44, 14, '555-2183']]
        >>> frame = tc.frame.create(rows, schema)
        -etc-
    </hide>

    Consider the following frame:
        >>> frame.inspect()
        [#]  name      age  tenure  phone
        ====================================
        [0]  Fred       39      16  555-1234
        [1]  Susan      33       3  555-0202
        [2]  Thurston   65      26  555-4510
        [3]  Judy       44      14  555-2183

    Use take to get the first two rows and look at the schema and data in the result:

        >>> frame.take(2)
        [['Fred', 39, 16, '555-1234'], ['Susan', 33, 3, '555-0202']]

    Limit the columns in our result to just the name and age column:

        >>> frame.take(2, columns=['name', 'age'])
        [['Fred', 39], ['Susan', 33]]

    <hide>
        >>> tmp = frame._scala  # flip over to scala and try
        >>> frame.take(2, columns=['name', 'age'])
        [[u'Fred', 39], [u'Susan', 33]]

    </hide>

    """
    require_type.non_negative_int(n, "n")
    require_type.non_negative_int(offset, "offset")
    if columns is not None:
        columns = affirm_type.list_of_str(columns, "columns")
        if not columns:
            return []

    if self._is_scala:
        scala_data = self._scala.take(n, offset, self._tc.jutils.convert.to_scala_option_list_string(columns))
        schema = get_schema_for_columns(self.schema, columns) if columns else self.schema
        data = TakeCollectHelper.scala_rows_to_python(self._tc, scala_data, schema)
    else:
        require_type.non_negative_int(n, "n")
        if offset:
            data = _take_offset(self, n, offset, columns)
        elif columns:
            select_columns = TakeCollectHelper.get_select_columns_function(self.schema, columns)
            data = self._python.rdd.map(select_columns).take(n)
        else:
            data = self._python.rdd.take(n)
    return data
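
One thing the docstring examples above do not show is offset. Reusing the same four-row frame, a hedged sketch of skipping the first row:

# Illustrative only, assuming the same frame and row order as in the docstring.
frame.take(2, offset=1, columns=['name'])
# expected: [['Susan'], ['Thurston']]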
Example #11
 def test_non_negative_int(self):
     require_type.non_negative_int(1, "a")
Example #12
 def test_non_negative_int(self):
     require_type.non_negative_int(1, "a")
Example #13
def take(self, n, offset=0, columns=None):
    """
    Get data subset.

    Take a subset of the currently active Frame.

    (See 'collect' operation to simply get all the data from the Frame)

    Parameters
    ----------

    :param n: (int) The number of rows to get from the frame (warning: do not overwhelm the python session
                    by taking too much)
    :param offset: (Optional[int]) The number of rows to skip before starting to copy.
    :param columns: (Optional[str or list[str]]) If not None, only the given columns' data will be provided.
                    By default, all columns are included.
    :return: (list[list[data]]) raw frame data

    Examples
    --------

    <hide>
        >>> schema = [('name',str), ('age', int), ('tenure', int), ('phone', str)]
        >>> rows = [['Fred', 39, 16, '555-1234'], ['Susan', 33, 3, '555-0202'], ['Thurston', 65, 26, '555-4510'], ['Judy', 44, 14, '555-2183']]
        >>> frame = tc.frame.create(rows, schema)
        -etc-
    </hide>

    Consider the following frame:
        >>> frame.inspect()
        [#]  name      age  tenure  phone
        ====================================
        [0]  Fred       39      16  555-1234
        [1]  Susan      33       3  555-0202
        [2]  Thurston   65      26  555-4510
        [3]  Judy       44      14  555-2183

    Use take to get the first two rows and look at the schema and data in the result:

        >>> frame.take(2)
        [['Fred', 39, 16, '555-1234'], ['Susan', 33, 3, '555-0202']]

    Limit the columns in our result to just the name and age column:

        >>> frame.take(2, columns=['name', 'age'])
        [['Fred', 39], ['Susan', 33]]

    <hide>
        >>> tmp = frame._scala  # flip over to scala and try
        >>> frame.take(2, columns=['name', 'age'])
        [[u'Fred', 39], [u'Susan', 33]]

    </hide>

    """
    require_type.non_negative_int(n, "n")
    require_type.non_negative_int(offset, "offset")
    if columns is not None:
        columns = affirm_type.list_of_str(columns, "columns")
        if not columns:
            return []

    if self._is_scala:
        scala_data = self._scala.take(
            n, offset,
            self._tc.jutils.convert.to_scala_option_list_string(columns))
        schema = get_schema_for_columns(self.schema,
                                        columns) if columns else self.schema
        data = TakeCollectHelper.scala_rows_to_python(self._tc, scala_data,
                                                      schema)
    else:
        require_type.non_negative_int(n, "n")
        if offset:
            data = _take_offset(self, n, offset, columns)
        elif columns:
            select_columns = TakeCollectHelper.get_select_columns_function(
                self.schema, columns)
            data = self._python.rdd.map(select_columns).take(n)
        else:
            data = self._python.rdd.take(n)
    return data
Example #14
def train(frame,
          source_column_name,
          dest_column_name,
          weight_column_name,
          max_steps=10,
          regularization=0.5,
          alpha=0.5,
          num_factors=3,
          use_implicit=False,
          num_user_blocks=2,
          num_item_blocks=3,
          checkpoint_iterations=10,
          target_rmse=0.05):
    """
    Create collaborative filtering model by training on given frame

    Parameters
    ----------

    :param frame: (Frame) The frame containing the data to train on
    :param source_column_name: (str) source column name.
    :param dest_column_name: (str) destination column name.
    :param weight_column_name: (str) weight column name.
    :param max_steps: (int) max number of super-steps (max iterations) before the algorithm terminates. Default = 10
    :param regularization: (float) value between 0 .. 1
    :param alpha: (double) value between 0 .. 1
    :param num_factors: (int) number of the desired factors (rank)
    :param use_implicit: (bool) use implicit preference
    :param num_user_blocks: (int) number of user blocks
    :param num_item_blocks: (int) number of item blocks
    :param checkpoint_iterations: (int) Number of iterations between checkpoints
    :param target_rmse: (double) target RMSE
    :return: (CollaborativeFilteringModel) A trained collaborative filtering model
    """
    from sparktk.frame.frame import Frame
    require_type(Frame, frame, 'frame')
    require_type.non_empty_str(source_column_name, "source_column_name")
    require_type.non_empty_str(dest_column_name, "dest_column_name")
    require_type.non_empty_str(weight_column_name, "weight_column_name")
    require_type.non_negative_int(max_steps, "max_steps")
    require_type(float, regularization, "regularization")
    if regularization > 1 or regularization < 0:
        raise ValueError(
            "'regularization' parameter must have a value between 0 and 1")
    require_type(float, alpha, "alpha")
    if alpha > 1 or alpha < 0:
        raise ValueError("'alpha' parameter must have a value between 0 and 1")
    require_type.non_negative_int(num_factors, "num_factors")
    require_type(bool, use_implicit, "use_implicit")
    require_type.non_negative_int(num_user_blocks, "num_user_blocks")
    require_type.non_negative_int(num_item_blocks, "num_item_blocks")
    require_type.non_negative_int(checkpoint_iterations,
                                  "checkpoint_iterations")
    require_type(float, target_rmse, "target_rmse")
    tc = frame._tc
    _scala_obj = get_scala_obj(tc)
    scala_model = _scala_obj.train(frame._scala, source_column_name,
                                   dest_column_name, weight_column_name,
                                   max_steps, regularization, alpha,
                                   num_factors, use_implicit, num_user_blocks,
                                   num_item_blocks, checkpoint_iterations,
                                   target_rmse)
    return CollaborativeFilteringModel(tc, scala_model)
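
train() repeats the same pattern for regularization and alpha: a float type check followed by a manual 0..1 range check. A hypothetical helper, not part of sparktk.arguments, that folds the two together:

# Hypothetical helper combining the float check and the 0..1 range check.
def require_unit_interval(value, name):
    if not isinstance(value, float):
        raise TypeError("'%s' must be a float" % name)
    if not 0.0 <= value <= 1.0:
        raise ValueError("'%s' parameter must have a value between 0 and 1" % name)

require_unit_interval(0.5, "regularization")   # OK
# require_unit_interval(1.5, "alpha")          # would raise ValueError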
Example #15
def train(frame,
          source_column_name,
          dest_column_name,
          weight_column_name,
          max_steps=10,
          regularization=0.5,
          alpha=0.5,
          num_factors=3,
          use_implicit=False,
          num_user_blocks=2,
          num_item_blocks=3,
          checkpoint_iterations=10,
          target_rmse=0.05):
    """
    Create collaborative filtering model by training on given frame

    Parameters
    ----------

    :param frame: (Frame) The frame containing the data to train on
    :param source_column_name: (str) source column name.
    :param dest_column_name: (str) destination column name.
    :param weight_column_name: (str) weight column name.
    :param max_steps: (int) max number of super-steps (max iterations) before the algorithm terminates. Default = 10
    :param regularization: (float) value between 0 .. 1
    :param alpha: (double) value between 0 .. 1
    :param num_factors: (int) number of the desired factors (rank)
    :param use_implicit: (bool) use implicit preference
    :param num_user_blocks: (int) number of user blocks
    :param num_item_blocks: (int) number of item blocks
    :param checkpoint_iterations: (int) Number of iterations between checkpoints
    :param target_rmse: (double) target RMSE
    :return: (CollaborativeFilteringModel) A trained collaborative filtering model
    """
    from sparktk.frame.frame import Frame
    require_type(Frame, frame, 'frame')
    require_type.non_empty_str(source_column_name, "source_column_name")
    require_type.non_empty_str(dest_column_name, "dest_column_name")
    require_type.non_empty_str(weight_column_name, "weight_column_name")
    require_type.non_negative_int(max_steps, "max_steps")
    require_type(float, regularization, "regularization")
    if regularization > 1 or regularization < 0:
        raise ValueError("'regularization' parameter must have a value between 0 and 1")
    require_type(float, alpha, "alpha")
    if alpha > 1 or alpha < 0:
        raise ValueError("'alpha' parameter must have a value between 0 and 1")
    require_type.non_negative_int(num_factors, "num_factors")
    require_type(bool, use_implicit, "use_implicit")
    require_type.non_negative_int(num_user_blocks, "num_user_blocks")
    require_type.non_negative_int(num_item_blocks, "num_item_blocks")
    require_type.non_negative_int(checkpoint_iterations, "checkpoint_iterations")
    require_type(float, target_rmse, "target_rmse")
    tc = frame._tc
    _scala_obj = get_scala_obj(tc)
    scala_model = _scala_obj.train(frame._scala,
                                   source_column_name,
                                   dest_column_name,
                                   weight_column_name,
                                   max_steps,
                                   regularization,
                                   alpha,
                                   num_factors,
                                   use_implicit,
                                   num_user_blocks,
                                   num_item_blocks,
                                   checkpoint_iterations,
                                   target_rmse)
    return CollaborativeFilteringModel(tc, scala_model)
Example #16
def train(frame,
          observation_columns,
          label_column,
          num_trees = 1,
          impurity = "variance",
          max_depth = 4,
          max_bins = 100,
          min_instances_per_node = 1,
          sub_sampling_rate = 1.0,
          feature_subset_category = "auto",
          seed = None,
          categorical_features_info = None):
    """
    Creates a Random Forest Regressor Model by training on the given frame

    Parameters
    ----------

    :param frame: (Frame) A frame of training data
    :param observation_columns: (list(str)) Column(s) containing the observations
    :param label_column: (str) Column name containing the label for each observation
    :param num_trees: (int) Number of trees in the random forest. Default is 1
    :param impurity: (str) Criterion used for information gain calculation. Default value is "variance".
    :param max_depth: (int) Maximum depth of the tree. Default is 4
    :param max_bins: (int) Maximum number of bins used for splitting features.
    :param min_instances_per_node: (int) Minimum number of records each child node must have after a split.
    :param sub_sampling_rate: (double) Fraction between 0..1 of the training data used for learning each decision tree.
    :param feature_subset_category: (str) Subset of observation columns, i.e., features,
                                 to consider when looking for the best split.
                                 Supported values "auto","all","sqrt","log2","onethird".
                                 If "auto" is set, this is based on num_trees: if num_trees == 1, set to "all"
                                 ; if num_trees > 1, set to "sqrt".
    :param seed: (Optional(int)) Random seed for bootstrapping and choosing feature subsets. Default is a randomly chosen seed.
    :param categorical_features_info: (Optional(Dict(str:int))) Arity of categorical features. Entry (name-> k) indicates
                                      that feature 'name' is categorical with 'k' categories indexed from 0:{0,1,...,k-1}

    :return: (RandomForestRegressorModel) The trained random forest regressor model

    Notes
    -----
    Random Forest is a supervised ensemble learning algorithm used to perform regression. A Random Forest
    Regressor model is initialized, trained on columns of a frame, and used to predict the value of each
    observation in the frame. This model runs the Spark ML implementation of Random Forest. During training,
    the decision trees are trained in parallel. During prediction, the average of all the trees' predicted
    values is the predicted value of the random forest.

    """
    require_type(Frame, frame, 'frame')
    column_list = affirm_type.list_of_str(observation_columns, "observation_columns")
    require_type.non_empty_str(label_column, "label_column")
    require_type.non_negative_int(num_trees, "num_trees")
    require_type.non_empty_str(impurity, "impurity")
    require_type.non_negative_int(max_depth, "max_depth")
    require_type.non_negative_int(max_bins, "max_bins")
    require_type.non_negative_int(min_instances_per_node, "min_instances_per_node")
    require_type(float, sub_sampling_rate, "sub_sampling_rate")
    if sub_sampling_rate > 1 or sub_sampling_rate < 0:
        raise ValueError("'sub_sampling_rate' parameter must have a value between 0 and 1")
    require_type.non_empty_str(feature_subset_category, "feature_subset_category")

    tc = frame._tc
    _scala_obj = get_scala_obj(tc)
    seed = int(os.urandom(2).encode('hex'), 16) if seed is None else seed
    scala_model = _scala_obj.train(frame._scala,
                                   tc.jutils.convert.to_scala_list_string(column_list),
                                   label_column,
                                   num_trees,
                                   impurity,
                                   max_depth,
                                   max_bins,
                                   min_instances_per_node,
                                   sub_sampling_rate,
                                   feature_subset_category,
                                   seed,
                                   __get_categorical_features_info(tc, categorical_features_info))

    return RandomForestRegressorModel(tc, scala_model)
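
One portability note on the seed line: os.urandom(2).encode('hex') only works on Python 2 (consistent with the Python 2-style output such as 0L and <type 'int'> elsewhere on this page); on Python 3, bytes objects have no 'hex' codec. A sketch of an equivalent random 16-bit seed on Python 3, shown only for comparison:

import os

# Python 3 equivalent of int(os.urandom(2).encode('hex'), 16): a random
# integer in the range 0..65535 built from two random bytes.
seed = int.from_bytes(os.urandom(2), "big")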