示例#1
0
文件: frame.py 项目: acx2015/atk
 def get_row_count(self, frame, where):
     """
     Count the rows of a frame, optionally only those matching a predicate.

     When `where` is falsy the count is read from the `row_count` attribute
     of the frame info.  Otherwise a "frame/count_where" command is executed
     with the frame URI and a UDF argument built from `where`, and whatever
     the executor returns is passed back to the caller.
     """
     # Row wrapper handed to get_udf_arg: it yields one constant "[1]" marker
     # per matching row, because only the number of matches matters here.
     # TODO - find a better way to do this with the RDDs from Python.
     def icountwhere(predicate, iterable):
         return ("[1]" for row in iterable if predicate(row))

     if where:
         payload = {'frame': frame.uri}
         payload['udf'] = get_udf_arg(frame, where, icountwhere)
         return executor.execute("frame/count_where", self, payload)
     return self._get_frame_info(frame).row_count
示例#2
0
 def get_row_count(self, frame, where):
     """
     Return the number of rows in `frame`, filtered by `where` when given.

     A falsy `where` short-circuits to the `row_count` stored on the frame
     info.  Any other value is turned into a UDF argument and sent with the
     frame URI to the "frame/count_where" command; the executor's result is
     returned unchanged.
     """
     # Wrapper passed to get_udf_arg: emits a constant "[1]" marker for each
     # row accepted by the predicate, since we are only counting rows.
     # TODO - find a better way to do this with the RDDs from Python.
     def icountwhere(predicate, iterable):
         return ("[1]" for row in iterable if predicate(row))

     if where:
         command_args = {'frame': frame.uri}
         command_args['udf'] = get_udf_arg(frame, where, icountwhere)
         return executor.execute("frame/count_where", self, command_args)
     return self._get_frame_info(frame).row_count
示例#3
0
文件: frame.py 项目: acx2015/atk
    def categorical_summary(self, frame, column_inputs):
        """
        Execute the 'frame/categorical_summary' command for the given columns.

        :param frame: frame whose `uri` is sent with the command
        :param column_inputs: iterable whose items are either a column name
            (string), or a 2-element tuple of (column name, dict of additional
            parameters).  A tuple's dict is merged on top of
            ``{'column': name}``.
        :returns: whatever `executor.execute` returns for the command
        :raises TypeError: if an item is neither a string nor a 2-element
            (string, dict) tuple
        """
        column_list_input = []
        for column_input in column_inputs:
            if isinstance(column_input, basestring):
                column_list_input.append({'column': column_input})
            elif (isinstance(column_input, tuple)
                  # Length is checked before indexing so malformed tuples
                  # raise the documented TypeError instead of an IndexError.
                  and len(column_input) == 2
                  and isinstance(column_input[0], basestring)
                  and isinstance(column_input[1], dict)):
                column_dict = {'column': column_input[0]}
                column_dict.update(column_input[1])
                column_list_input.append(column_dict)
            else:
                raise TypeError('Column inputs should be specified as strings or 2-element Tuple consisting of column name as string and dictionary for additional parameters')

        arguments = {'frame': frame.uri,
                     'column_input': column_list_input}
        return executor.execute('frame/categorical_summary', self, arguments)
示例#4
0
    def categorical_summary(self, frame, column_inputs):
        """
        Execute the 'frame/categorical_summary' command for the given columns.

        :param frame: frame whose `uri` is sent with the command
        :param column_inputs: iterable whose items are either a column name
            (string), or a 2-element tuple of (column name, dict of additional
            parameters).  A tuple's dict is merged on top of
            ``{'column': name}``.
        :returns: whatever `executor.execute` returns for the command
        :raises TypeError: if an item is neither a string nor a 2-element
            (string, dict) tuple
        """
        column_list_input = []
        for column_input in column_inputs:
            if isinstance(column_input, basestring):
                column_list_input.append({'column': column_input})
            elif (isinstance(column_input, tuple)
                  # Length is checked before indexing so malformed tuples
                  # raise the documented TypeError instead of an IndexError.
                  and len(column_input) == 2
                  and isinstance(column_input[0], basestring)
                  and isinstance(column_input[1], dict)):
                column_dict = {'column': column_input[0]}
                column_dict.update(column_input[1])
                column_list_input.append(column_dict)
            else:
                raise TypeError('Column inputs should be specified as strings or 2-element Tuple consisting of column name as string and dictionary for additional parameters')

        arguments = {'frame': frame.uri,
                     'column_input': column_list_input}
        return executor.execute('frame/categorical_summary', self, arguments)
示例#5
0
文件: h2omodels.py 项目: ashahba/atk
        def train(self, frame, value_column, observation_columns, num_trees=50, max_depth=20, num_bins=20, min_rows=10, feature_subset_category='auto', seed=None, sample_rate=None):
            """
            Build H2O Random Forests Regressor model using the observation columns and target column.

            H2O's implementation of distributed random forest is slow for large trees due to the
            overhead of shipping histograms across the network. This plugin runs H2O random forest
            in a single node for small datasets, and multiple nodes for large datasets. The Spark
            context is released to cleanly shutdown the H2O Sparkling Water context.

            :param frame: A frame to train the model on
            :type frame: Frame
            :param value_column: Column name containing the value for each observation
            :type value_column: unicode
            :param observation_columns: Column(s) containing the observations
            :type observation_columns: list
            :param num_trees: (default=50)  Number of trees in the random forest.
            :type num_trees: int32
            :param max_depth: (default=20)  Maximum depth of the tree.
            :type max_depth: int32
            :param num_bins: (default=20)  For numerical columns (real/int), build a histogram of (at least) this many bins.
            :type num_bins: int32
            :param min_rows: (default=10)  Minimum number of rows to assign to terminal nodes.
            :type min_rows: int32
            :param feature_subset_category: (default=auto)  Number of features to consider for splits at each node. Supported values "auto", "all", "sqrt", "onethird".
                If "auto" is set, this is based on numTrees: if numTrees == 1, set to "all"; if numTrees > 1, set to "onethird".
            :type feature_subset_category: unicode
            :param seed: (default=None)  Seed for random numbers (affects sampling) - Note: only reproducible when running single threaded.
            :type seed: int32
            :param sample_rate: (default=None)  Row sample rate per tree (from 0.0 to 1.0).
            :type sample_rate: float64

            :returns: object
                      An object with the results of the trained Random Forest Regressor:
                      'value_column': the column name containing the value of each observation,
                      'observation_columns': the list of observation columns on which the model was trained,
                      'num_trees': the number of decision trees in the random forest,
                      'max_depth': the maximum depth of the tree,
                      'num_bins': for numerical columns, build a histogram of at least this many bins,
                      'min_rows': minimum number of rows to assign to terminal nodes,
                      'feature_subset_category': number of features to consider for splits at each node,
                      'tree_stats': dictionary with tree statistics for trained model,
                      'varimp': variable importances

            :rtype: object
            """
            from trustedanalytics.rest.command import executor
            from trustedanalytics.core.admin import release
            from trustedanalytics.core.h2omodels import H2oRandomForestRegressorTrainResult
            arguments = {'model' : self.uri,
                         'frame' : frame.uri,
                         'value_column': value_column,
                         'observation_columns': observation_columns,
                         'num_trees': num_trees,
                         'max_depth': max_depth,
                         'num_bins': num_bins,
                         'min_rows': min_rows,
                         'feature_subset_category': feature_subset_category,
                         'seed': seed,
                         'sample_rate': sample_rate}
            release()
            # Single-argument call form prints the same text under Python 2's
            # print statement and Python 3's print function.
            print("Training H2O random forest regression model...")
            try:
                # Frames below 100000 rows use the local training command;
                # anything larger uses the distributed one.
                if frame.row_count < 100000:
                    command = "model:h2o_random_forest_regressor_private/_local_train"
                else:
                    command = "model:h2o_random_forest_regressor_private/_distributed_train"
                result = executor.execute(command, baseclass, arguments)
            finally:
                # Release again whether or not training succeeded.
                release()
            return H2oRandomForestRegressorTrainResult(result)
示例#6
0
文件: graph.py 项目: xoltar/atk
 def get_edge_count(self, graph):
     """
     Execute the "graph:/edge_count" command for `graph`.

     The graph is identified by the value of `self.get_ia_uri(graph)`; the
     executor's result is returned unchanged.
     """
     graph_uri = self.get_ia_uri(graph)
     return executor.execute("graph:/edge_count", graph, {'graph': graph_uri})
示例#7
0
文件: graph.py 项目: rainiraj/atk
 def get_edge_count(self, graph):
     """
     Execute the "graph:/edge_count" command for `graph`.

     The graph is identified by its `uri` attribute; the executor's result
     is returned unchanged.
     """
     graph_uri = graph.uri
     return executor.execute("graph:/edge_count", graph, {'graph': graph_uri})
示例#8
0
 def get_edge_count(self, graph):
     """
     Ask the executor to run "graph:/edge_count" against `graph`.

     Sends the graph's `uri` as the only argument and returns the executor's
     result as-is.
     """
     edge_count_args = dict()
     edge_count_args['graph'] = graph.uri
     return executor.execute("graph:/edge_count", graph, edge_count_args)
示例#9
0
        def train(self,
                  frame,
                  value_column,
                  observation_columns,
                  num_trees=50,
                  max_depth=20,
                  num_bins=20,
                  min_rows=10,
                  feature_subset_category='auto',
                  seed=None,
                  sample_rate=None):
            """
            Build H2O Random Forests Regressor model using the observation columns and target column.

            H2O's implementation of distributed random forest is slow for large trees due to the
            overhead of shipping histograms across the network. This plugin runs H2O random forest
            in a single node for small datasets, and multiple nodes for large datasets. The Spark
            context is released to cleanly shutdown the H2O Sparkling Water context.

            :param frame: A frame to train the model on
            :type frame: Frame
            :param value_column: Column name containing the value for each observation
            :type value_column: unicode
            :param observation_columns: Column(s) containing the observations
            :type observation_columns: list
            :param num_trees: (default=50)  Number of trees in the random forest.
            :type num_trees: int32
            :param max_depth: (default=20)  Maximum depth of the tree.
            :type max_depth: int32
            :param num_bins: (default=20)  For numerical columns (real/int), build a histogram of (at least) this many bins.
            :type num_bins: int32
            :param min_rows: (default=10)  Minimum number of rows to assign to terminal nodes.
            :type min_rows: int32
            :param feature_subset_category: (default=auto)  Number of features to consider for splits at each node. Supported values "auto", "all", "sqrt", "onethird".
                If "auto" is set, this is based on numTrees: if numTrees == 1, set to "all"; if numTrees > 1, set to "onethird".
            :type feature_subset_category: unicode
            :param seed: (default=None)  Seed for random numbers (affects sampling) - Note: only reproducible when running single threaded.
            :type seed: int32
            :param sample_rate: (default=None)  Row sample rate per tree (from 0.0 to 1.0).
            :type sample_rate: float64

            :returns: object
                      An object with the results of the trained Random Forest Regressor:
                      'value_column': the column name containing the value of each observation,
                      'observation_columns': the list of observation columns on which the model was trained,
                      'num_trees': the number of decision trees in the random forest,
                      'max_depth': the maximum depth of the tree,
                      'num_bins': for numerical columns, build a histogram of at least this many bins,
                      'min_rows': minimum number of rows to assign to terminal nodes,
                      'feature_subset_category': number of features to consider for splits at each node,
                      'tree_stats': dictionary with tree statistics for trained model,
                      'varimp': variable importances

            :rtype: object
            """
            from trustedanalytics.rest.command import executor
            from trustedanalytics.core.admin import release
            from trustedanalytics.core.h2omodels import H2oRandomForestRegressorTrainResult
            arguments = {
                'model': self.uri,
                'frame': frame.uri,
                'value_column': value_column,
                'observation_columns': observation_columns,
                'num_trees': num_trees,
                'max_depth': max_depth,
                'num_bins': num_bins,
                'min_rows': min_rows,
                'feature_subset_category': feature_subset_category,
                'seed': seed,
                'sample_rate': sample_rate
            }
            release()
            # Single-argument call form prints the same text under Python 2's
            # print statement and Python 3's print function.
            print("Training H2O random forest regression model...")
            try:
                # Frames below 100000 rows use the local training command;
                # anything larger uses the distributed one.
                if frame.row_count < 100000:
                    command = "model:h2o_random_forest_regressor_private/_local_train"
                else:
                    command = "model:h2o_random_forest_regressor_private/_distributed_train"
                result = executor.execute(command, baseclass, arguments)
            finally:
                # Release again whether or not training succeeded.
                release()
            return H2oRandomForestRegressorTrainResult(result)
示例#10
0
文件: graph.py 项目: tgctaka/atk
 def get_vertex_count(self, graph):
     """
     Execute the "graph:/vertex_count" command for `graph`.

     The graph's `uri` is sent as the only argument and the executor's
     result is returned unchanged.
     """
     graph_uri = graph.uri
     return executor.execute("graph:/vertex_count", graph, {"graph": graph_uri})
示例#11
0
文件: graph.py 项目: anjalisood/atk
 def get_vertex_count(self, graph):
     """
     Execute the "graph:/vertex_count" command for `graph`.

     The graph's `uri` is sent as the only argument; the 'value' entry of
     the executor's result is returned.
     """
     graph_uri = graph.uri
     response = executor.execute("graph:/vertex_count", graph, {'graph': graph_uri})
     return response['value']
示例#12
0
 def _import_orientdb(self, graph, source):
     """
     Execute the "graph:/_import_orientdb" command for `graph`.

     The command arguments are whatever `source.to_json()` returns, with a
     'graph' entry set to the graph's `uri` (the returned object is modified
     in place).  The executor's result is returned unchanged.
     """
     payload = source.to_json()
     graph_uri = graph.uri
     payload['graph'] = graph_uri
     return executor.execute("graph:/_import_orientdb", graph, payload)
示例#13
0
文件: graph.py 项目: jitendra42/atk
 def _import_orientdb(self, graph, source):
     """
     Run "graph:/_import_orientdb" using arguments derived from `source`.

     Starts from the object returned by `source.to_json()`, stores the
     graph's `uri` under its 'graph' key (mutating that object), and hands
     it to the executor.  Returns the executor's result as-is.
     """
     import_args = source.to_json()
     graph_uri = graph.uri
     import_args['graph'] = graph_uri
     return executor.execute("graph:/_import_orientdb", graph, import_args)
示例#14
0
文件: graph.py 项目: xoltar/atk
 def get_edge_count(self, graph):
     """
     Ask the executor to run "graph:/edge_count" against `graph`.

     The 'graph' argument is the value of `self.get_ia_uri(graph)`; the
     executor's result is returned as-is.
     """
     edge_count_args = dict()
     edge_count_args['graph'] = self.get_ia_uri(graph)
     return executor.execute("graph:/edge_count", graph, edge_count_args)