Example #1
from collections import OrderedDict
from functools import partial

from sklearn.metrics import (accuracy_score, auc, average_precision_score,
                             f1_score, log_loss, precision_recall_curve,
                             precision_score, recall_score, roc_curve)

# bootstrap_metric, plot_roc_curve and plot_precision_recall_curve are
# project-specific helpers assumed to be defined elsewhere in this codebase.


def binary_report(predictions, col_true='CLASS'):
    print('Binary classification results:')

    y_true = (predictions[col_true] == 'QSO')
    y_pred_proba = predictions['QSO_PHOTO']
    y_pred_binary = (predictions['CLASS_PHOTO'] == 'QSO')

    n_pos = y_pred_binary.sum()
    n_all = len(y_pred_binary)
    print('Predicted positives: {}/{} ({:.2f}%)'.format(
        n_pos, n_all, n_pos / n_all * 100))

    logloss, logloss_err = bootstrap_metric(log_loss, y_true, y_pred_proba)
    print('Logloss = {:.4f} ({:.4f})'.format(logloss, logloss_err))

    binary_metrics = OrderedDict([
        ('Accuracy', partial(bootstrap_metric, accuracy_score)),
        ('F1', partial(bootstrap_metric, f1_score)),
        ('Precision', partial(bootstrap_metric, precision_score)),
        ('Recall', partial(bootstrap_metric, recall_score)),
    ])
    for metric_name, metric_func in binary_metrics.items():
        score, score_err = metric_func(y_true, y_pred_binary)
        print('{} = {:.4f} ({:.4f})'.format(metric_name, score, score_err))

    # ROC AUC
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
    roc_auc = auc(fpr, tpr)
    print('ROC AUC = {:.4f}'.format(roc_auc))
    plot_roc_curve(fpr, tpr, roc_auc)

    # Precision - recall curve
    average_precision = average_precision_score(y_true, y_pred_proba)
    precision = precision_score(y_true, y_pred_binary)
    recall = recall_score(y_true, y_pred_binary)
    precisions, recalls, thresholds = precision_recall_curve(
        y_true, y_pred_proba)
    plot_precision_recall_curve(precisions, recalls, average_precision,
                                precision, recall)
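
The example relies on a project-specific bootstrap_metric(metric_func, y_true, y_pred) helper that returns a score together with an error estimate. Below is a minimal sketch of what such a helper could look like, assuming the error is taken as the standard deviation over bootstrap resamples; the signature is read off the call sites above and everything else is an assumption.

import numpy as np


def bootstrap_metric(metric_func, y_true, y_pred, n_iterations=100, seed=0):
    # Hypothetical helper matching the call sites above: evaluate the metric
    # on the full sample and estimate its uncertainty by bootstrap resampling.
    rng = np.random.RandomState(seed)
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    scores = []
    for _ in range(n_iterations):
        idx = rng.randint(0, len(y_true), size=len(y_true))
        if len(np.unique(y_true[idx])) < 2:
            continue  # skip degenerate resamples containing a single class
        scores.append(metric_func(y_true[idx], y_pred[idx]))
    return metric_func(y_true, y_pred), float(np.std(scores))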
Example #2
from collections import OrderedDict, defaultdict
from itertools import chain, product

import numpy as np
import tensorflow as tf

# The SPN node classes (Sum, ParallelSums, SumsLayer, Product, PermuteProducts,
# ProductsLayer, Concat), StructureError and traverse_graph are assumed to be
# imported from the surrounding SPN library.


def convert_to_layer_nodes(root):
    """
    At each level in the SPN rooted in the 'root' node, model all the nodes
    as a single layer-node.

    Args:
        root (Node): The root of the SPN graph.

    Returns:
        root (Node): The root of the SPN graph, with each layer modelled as a
                     single layer-node.
    """

    parents = defaultdict(list)
    depths = defaultdict(list)
    node_to_depth = OrderedDict()
    node_to_depth[root] = 1

    def get_parents(node):
        # Add to Parents dict
        if node.is_op:
            for i in node.inputs:
                if (i and  # Input not empty
                        not (i.is_param or i.is_var)):
                    parents[i.node].append(node)
                    node_to_depth[i.node] = node_to_depth[node] + 1

    def permute_inputs(input_values, input_sizes):
        # For a given list of inputs and their corresponding sizes, create a
        # nested-list of (input, index) pairs.
        # E.g: input_values = [(A, [2, 5]), (B, None)]
        #      input_sizes = [2, 3]
        #      inputs = [[('A', 2), ('A', 5)],
        #                [('B', 0), ('B', 1), ('B', 2)]]
        inputs = [
            list(product([inp.node], inp.indices)) if inp and inp.indices else
            list(product([inp.node], list(range(inp_size))))
            for inp, inp_size in zip(input_values, input_sizes)
        ]

        # For a given nested-list of (input, index) pairs, permute over the inputs
        # E.g: permuted_inputs = [('A', 2), ('B', 0),
        #                         ('A', 2), ('B', 1),
        #                         ('A', 2), ('B', 2),
        #                         ('A', 5), ('B', 0),
        #                         ('A', 5), ('B', 1),
        #                         ('A', 5), ('B', 2)]
        permuted_inputs = list(product(*[inps for inps in inputs]))
        return list(chain(*permuted_inputs))

    # Create a parents dictionary of the SPN graph
    traverse_graph(root, fun=get_parents, skip_params=True)

    # Create a depth dictionary of the SPN graph
    for key, value in node_to_depth.items():
        depths[value].append(key)
    spn_depth = len(depths)

    # Iterate through each depth of the SPN, starting from the deepest layer,
    # moving up to the root node
    for depth in range(spn_depth, 1, -1):
        if isinstance(depths[depth][0], (Sum, ParallelSums)):  # A Sums Layer
            # Create a default SumsLayer node
            with tf.name_scope("Layer%s" % depth):
                sums_layer = SumsLayer(name="SumsLayer-%s.%s" % (depth, 1))
            # Initialize a counter for keeping track of number of sums
            # modelled in the layer node
            layer_num_sums = 0
            # Initialize an empty list for storing sum-input-sizes of sums
            # modelled in the layer node
            num_or_size_sums = []
            # Iterate through each node at the current depth of the SPN
            for node in depths[depth]:
                # TODO: To be replaced with node.num_sums once AbstractSums
                # class is introduced
                # No. of sums modelled by the current node
                node_num_sums = (1 if isinstance(node, Sum) else node.num_sums)
                # Add Input values of the current node to the SumsLayer node
                sums_layer.add_values(*node.values * node_num_sums)
                # Add sum-input-size, of each sum modelled in the current node,
                # to the list
                num_or_size_sums += [sum(node.get_input_sizes()[2:])
                                     ] * node_num_sums
                # Visit each parent of the current node
                for parent in parents[node]:
                    try:
                        # 'Values' in case parent is an Op node
                        values = list(parent.values)
                    except AttributeError:
                        # 'Inputs' in case parent is a Concat node
                        values = list(parent.inputs)
                    # Iterate through each input value of the current parent node
                    for i, value in enumerate(values):
                        # If the value is the current node
                        if value.node == node:
                            # Check if it has indices
                            if value.indices is not None:
                                # If so, then just add the num-sums of the
                                # layer-op as offset
                                indices = (np.asarray(value.indices) +
                                           layer_num_sums).tolist()
                            else:
                                # If not, then create a list accordingly
                                indices = list(
                                    range(layer_num_sums,
                                          (layer_num_sums + node_num_sums)))
                            # Replace previous (node) Input value in the
                            # current parent node, with the new layer-node value
                            values[i] = (sums_layer, indices)
                            break  # Once child-node found, don't have to search further
                    # Reset values of the current parent node, by including
                    # the new child (Layer-node)
                    try:
                        # set 'values' in case parent is an Op node
                        parent.set_values(*values)
                    except AttributeError:
                        # set 'inputs' in case parent is a Concat node
                        parent.set_inputs(*values)
                # Increment num-sums-counter of the layer-node
                layer_num_sums += node_num_sums
                # Disconnect
                node.disconnect_inputs()

            # After all nodes at a certain depth are modelled into a Layer-node,
            # set num-sums parameter accordingly
            sums_layer.set_sum_sizes(num_or_size_sums)
        elif isinstance(depths[depth][0],
                        (Product, PermuteProducts)):  # A Products Layer
            with tf.name_scope("Layer%s" % depth):
                prods_layer = ProductsLayer(name="ProductsLayer-%s.%s" %
                                            (depth, 1))
            # Initialize a counter for keeping track of number of prods
            # modelled in the layer node
            layer_num_prods = 0
            # Initialize an empty list for storing prod-input-sizes of prods
            # modelled in the layer node
            num_or_size_prods = []
            # Iterate through each node at the current depth of the SPN
            for node in depths[depth]:
                # Get input values and sizes of the product node
                input_values = list(node.values)
                input_sizes = list(node.get_input_sizes())
                if isinstance(node, PermuteProducts):
                    # Permute over input-values to model permuted products
                    input_values = permute_inputs(input_values, input_sizes)
                    node_num_prods = node.num_prods
                    prod_input_size = len(input_values) // node_num_prods
                elif isinstance(node, Product):
                    node_num_prods = 1
                    prod_input_size = int(sum(input_sizes))

                # Add Input values of the current node to the ProductsLayer node
                prods_layer.add_values(*input_values)
                # Add prod-input-size, of each product modelled in the current
                # node, to the list
                num_or_size_prods += [prod_input_size] * node_num_prods
                # Visit each parent of the current node
                for parent in parents[node]:
                    values = list(parent.values)
                    # Iterate through each input value of the current parent node
                    for i, value in enumerate(values):
                        # If the value is the current node
                        if value.node == node:
                            # Check if it has indices
                            if value.indices is not None:
                                # If so, then just add the num-prods of the
                                # layer-op as offset
                                indices = (np.asarray(value.indices) +
                                           layer_num_prods).tolist()
                            else:
                                # If not, then create a list accordingly
                                indices = list(
                                    range(layer_num_prods,
                                          (layer_num_prods + node_num_prods)))
                            # Replace previous (node) Input value in the
                            # current parent node, with the new layer-node value
                            values[i] = (prods_layer, indices)
                    # Reset values of the current parent node, by including
                    # the new child (Layer-node)
                    parent.set_values(*values)
                # Increment num-prods-counter of the layer node
                layer_num_prods += node_num_prods
                # Disconnect
                node.disconnect_inputs()

            # After all nodes at a certain depth are modelled into a Layer-node,
            # set num-prods parameter accordingly
            prods_layer.set_prod_sizes(num_or_size_prods)

        elif isinstance(depths[depth][0],
                        (SumsLayer, ProductsLayer, Concat)):
            # Already a layer node or a Concat node, so nothing to convert
            pass
        else:
            raise StructureError("Unknown node-type: {}".format(
                depths[depth][0]))

    return root
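
The nested-list permutation that permute_inputs performs above can be shown in isolation with plain labels and itertools only. The sketch below uses hypothetical (label, indices) tuples in place of SPN input objects; it is illustrative, not part of the SPN library.

from itertools import chain, product


def permute_value_index_pairs(input_values, input_sizes):
    # Each input is a (label, indices) pair; when indices is None, the
    # corresponding entry of input_sizes supplies range(size) instead.
    inputs = [
        list(product([name], indices)) if indices is not None
        else list(product([name], range(size)))
        for (name, indices), size in zip(input_values, input_sizes)
    ]
    # Cartesian product across the inputs, flattened into one list of
    # (label, index) pairs.
    return list(chain(*product(*inputs)))


# Reproduces the example from the comments above:
# [('A', 2), ('B', 0), ('A', 2), ('B', 1), ('A', 2), ('B', 2),
#  ('A', 5), ('B', 0), ('A', 5), ('B', 1), ('A', 5), ('B', 2)]
print(permute_value_index_pairs([('A', [2, 5]), ('B', None)], [2, 3]))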
Example #3
import warnings
from collections import OrderedDict

import numpy as np
import pyopencl as cl

# The Worker base class and get_float_type_def are assumed to be imported
# from the surrounding MOT library.


class _ProcedureWorker(Worker):
    def __init__(self, cl_environment, compile_flags, cl_function, kernel_data,
                 double_precision, use_local_reduction):
        super().__init__(cl_environment)
        self._cl_function = cl_function
        self._kernel_data = OrderedDict(sorted(kernel_data.items()))
        self._double_precision = double_precision
        self._use_local_reduction = use_local_reduction

        self._mot_float_dtype = np.float32
        if double_precision:
            self._mot_float_dtype = np.float64

        for data in self._kernel_data.values():
            data.set_mot_float_dtype(self._mot_float_dtype)

        self._kernel = self._build_kernel(self._get_kernel_source(),
                                          compile_flags)

        self._workgroup_size = self._kernel.run_procedure.get_work_group_info(
            cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
            self._cl_environment.device)
        if not self._use_local_reduction:
            self._workgroup_size = 1

        self._kernel_inputs = {
            name: data.get_kernel_inputs(self._cl_context,
                                         self._workgroup_size)
            for name, data in self._kernel_data.items()
        }

    def calculate(self, range_start, range_end):
        nmr_problems = range_end - range_start

        func = self._kernel.run_procedure
        func.set_scalar_arg_dtypes(self.get_scalar_arg_dtypes())

        kernel_inputs_list = []
        for inputs in [
                self._kernel_inputs[name] for name in self._kernel_data
        ]:
            kernel_inputs_list.extend(inputs)

        func(self._cl_queue, (int(nmr_problems * self._workgroup_size), ),
             (int(self._workgroup_size), ),
             *kernel_inputs_list,
             global_offset=(int(range_start * self._workgroup_size), ))

        for name, data in self._kernel_data.items():
            data.enqueue_readouts(self._cl_queue, self._kernel_inputs[name],
                                  range_start, range_end)

    def _build_kernel(self, kernel_source, compile_flags=()):
        """Convenience function for building the kernel for this worker.

        Args:
            kernel_source (str): the kernel source to use for building the kernel

        Returns:
            cl.Program: a compiled CL kernel
        """
        from mot import configuration
        if configuration.should_ignore_kernel_compile_warnings():
            warnings.simplefilter("ignore")
        return cl.Program(self._cl_context,
                          kernel_source).build(' '.join(compile_flags))

    def _get_kernel_source(self):
        assignment = ''
        if self._cl_function.get_return_type() != 'void':
            assignment = '__results[gid] = '

        variable_inits = []
        function_call_inputs = []
        post_function_callbacks = []
        for parameter in self._cl_function.get_parameters():
            data = self._kernel_data[parameter.name]
            call_args = (parameter.name, '_' + parameter.name, 'gid',
                         parameter.data_type.address_space)

            variable_inits.append(data.initialize_variable(*call_args))
            function_call_inputs.append(
                data.get_function_call_input(*call_args))
            post_function_callbacks.append(
                data.post_function_callback(*call_args))

        kernel_source = ''
        kernel_source += get_float_type_def(self._double_precision)
        kernel_source += '\n'.join(data.get_type_definitions()
                                   for data in self._kernel_data.values())
        kernel_source += self._cl_function.get_cl_code()
        kernel_source += '''
            __kernel void run_procedure(''' + ",\n".join(self._get_kernel_arguments()) + '''){
                ulong gid = (ulong)(get_global_id(0) / get_local_size(0));
                
                ''' + '\n'.join(variable_inits) + '''     
                
                ''' + assignment + ' ' + self._cl_function.get_cl_function_name() + '(' + \
                         ', '.join(function_call_inputs) + ''');
                
                ''' + '\n'.join(post_function_callbacks) + '''
            }
        '''
        return kernel_source

    def _get_kernel_arguments(self):
        """Get the list of kernel arguments for loading the kernel data elements into the kernel.

        This will use the sorted keys for looping through the kernel input items.

        Returns:
            list of str: the list of parameter definitions
        """
        declarations = []
        for name, data in self._kernel_data.items():
            declarations.extend(data.get_kernel_parameters('_' + name))
        return declarations

    def get_scalar_arg_dtypes(self):
        """Get the location and types of the input scalars.

        Returns:
            list: for every kernel input element, either None if the data is a
                buffer, or the numpy data type if it is a scalar.
        """
        dtypes = []
        for name, data in self._kernel_data.items():
            dtypes.extend(data.get_scalar_arg_dtypes())
        return dtypes
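
The worker above wires MOT-specific kernel data objects into a PyOpenCL program, but the underlying PyOpenCL pattern (build a program, declare scalar argument dtypes, enqueue the kernel, read the results back) can be shown on its own. A minimal sketch, assuming an OpenCL device is available; the kernel and all names below are illustrative only.

import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

program = cl.Program(ctx, """
    __kernel void scale(__global float *data, float factor) {
        ulong gid = get_global_id(0);
        data[gid] *= factor;
    }
""").build()

data = np.arange(8, dtype=np.float32)
buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
                hostbuf=data)

kernel = program.scale
# None for buffer arguments, a numpy dtype for scalars, mirroring what
# get_scalar_arg_dtypes() assembles for the worker above.
kernel.set_scalar_arg_dtypes([None, np.float32])
kernel(queue, data.shape, None, buf, np.float32(2.0))

cl.enqueue_copy(queue, data, buf)
print(data)  # the input values doubled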
Example #4
    def get_entitiesdata(self, datatype, since, sf):
        entities = []
        # we need to use UTC as the Salesforce API requires this
        end = datetime.now(pytz.UTC)

        if since is None:
            result = []
            created_date_stmt = ""
            while True:
                query = "SELECT Id, CreatedDate FROM {} {} ORDER BY CreatedDate".format(
                    datatype, created_date_stmt)
                records = sf.query(query)["records"]
                temp_result = [x['Id'] for x in records]
                result.extend(temp_result)
                if records:
                    created_date_stmt = "WHERE CreatedDate > {}".format(
                        records[-1]['CreatedDate'])
                if len(temp_result) < 2000:  # salesforce limit 2000 rows
                    break
        else:
            result = []
            start = iso8601.parse_date(since)
            logging.info("Since datetime presented: %s", start)
            logging.info("End -30 days delta: %s", (end - timedelta(days=30)))
            # salesforce replicates only the last 30 days
            if start < (end - timedelta(days=30)):
                logging.warning(
                    "Salesforce replicates only last 30 days but since is set to {}"
                    .format(start))
                start = datetime.now(
                    pytz.UTC) - timedelta(days=30) + timedelta(seconds=60)
                logging.warning("Changed since to {}".format(start))

            if getattr(sf, datatype):
                if end > (start + timedelta(seconds=60)):
                    result = getattr(sf, datatype).updated(start, end)["ids"]
                    deleted = getattr(sf,
                                      datatype).deleted(start,
                                                        end)["deletedRecords"]
                    for e in deleted:
                        c = OrderedDict({"_id": e["id"]})
                        c.update({"_updated": "%s" % e["deletedDate"]})
                        c.update({"_deleted": True})

                        entities.append(c)
        if result:
            for e in result:
                c = getattr(sf, datatype).get(e)
                c.update({"_id": e})
                c.update({"_updated": "%s" % c["LastModifiedDate"]})

                for prop, value in c.items():
                    schema = [
                        item for item in self._entities[datatype]
                        if item["name"] == prop
                    ]
                    if (value and len(schema) > 0 and "type" in schema[0]
                            and schema[0]["type"] == "datetime"):
                        c[prop] = to_transit_datetime(parse(value))

                entities.append(c)
        return entities
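
When since is None, the method above pages through all records by keyset pagination on CreatedDate, because a single Salesforce query returns at most 2000 rows. The same pattern in isolation, with a hypothetical fetch_page callable standing in for sf.query:

PAGE_LIMIT = 2000  # Salesforce returns at most 2000 rows per query


def fetch_all_ids(fetch_page):
    # fetch_page(last_created) is assumed to return the records created after
    # last_created (or the first page when last_created is None), ordered by
    # CreatedDate and capped at PAGE_LIMIT rows.
    ids, last_created = [], None
    while True:
        records = fetch_page(last_created)
        ids.extend(r["Id"] for r in records)
        if records:
            last_created = records[-1]["CreatedDate"]
        if len(records) < PAGE_LIMIT:
            break
    return ids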