def master_node(chunk_start: int, block: int, file_path: str,
                q: BaseProxy) -> None:
    """Aggregate the ``Driver``/``Trip`` records found in one file chunk.

    Opens ``file_path``, seeks to ``chunk_start``, reads at most ``block``
    characters and folds the lines of that chunk into a dictionary of
    ``{driver_name: (distance, time_spent)}``. The dictionary is then put
    on ``q``.

    Parameters
    ----------
    chunk_start*: Offset handed to ``seek`` before the chunk is read.
    block*: Maximum amount of data read from the file for this chunk.
    file_path*: Relative or absolute path of the input file.
    q*: Queue-like object; the aggregated dictionary is delivered with ``q.put``.

    (* - Required parameters)

    Returns
    -------
    None
    """
    with open(file_path, 'r') as f:
        f.seek(chunk_start)
        lines = f.read(block).splitlines()

    process_dict = {}
    for line in lines:
        # The record type is decided by the line prefix. A substring test
        # ('Driver' in line) would also catch unrelated lines, or Trip lines
        # that merely mention "Driver", and register them under a bogus key.
        if line.startswith('Driver'):
            # Registering a driver up front keeps drivers without trips in
            # the result (Business_Rule-1 and Business_Rule-2: refer to
            # section 'Possible Implementation' in Readme.md file).
            driver_name = line[7:]
            if driver_name and driver_name not in process_dict:
                process_dict[driver_name] = (0, 0)
        elif line.startswith('Trip'):
            driver_name, distance, time_spent = slave_node(line[5:])
            if not driver_name:
                # Nothing to aggregate without a driver name.
                continue
            if driver_name not in process_dict:
                process_dict[driver_name] = (distance, time_spent)
            else:
                # Add miles and time_spent to the existing record.
                current_distance, current_time = process_dict[driver_name]
                process_dict[driver_name] = (current_distance + distance,
                                             current_time + time_spent)
    q.put(process_dict)
Exemplo n.º 2
0
def prepare_jobs(batch_iterator, model_params, schema_params, num_features,
                 model_weights: dict, enable_local_indexing: bool,
                 job_queue: BaseProxy, has_intercept: bool):
    """
    Turn batches of entity-grouped data into Job objects, one Job per entity.

    Every Job is put on ``job_queue`` and the id of its entity is then yielded,
    so the caller drives production by iterating the returned generator.
    :param batch_iterator:        callable whose result is fed to dataset_reader to obtain
                                  (features, labels) batches
    :param model_params:          supplies partition_entity, feature_bag and offset_column_name
    :param schema_params:         supplies uid_column_name, label_column_name and weight_column_name
    :param num_features:          number of features in global space
    :param model_weights:         prior results keyed by entity id; a result exposes ``theta`` and
                                  ``unique_global_indices`` and is used to warm start the Job
    :param enable_local_indexing: index the feature columns per entity instead of using global indices
    :param job_queue:             queue receiving the generated jobs
    :param has_intercept:         whether the first coefficient of a model is the intercept
    :return: a generator of entity_ids.

    The feature_bag is represented in sparse tensor format. Take per_member feature bag for example.
    The following batch has three records, two belonging to member #0 and one belonging to member #1.
        member #0 has two records
            per_member_indices = [[0, 7, 60, 80, 95], [34, 57]]
            per_member_values = [[1.0, 2.0, 3.0, 5.0, 6.6], [1.0, 2.0]]
        member #1 has one record
            per_member_indices = [[10, 11]]
            per_member_values = [[-3.5, 2.3]]
    The batch combines both members' records:
        per_member_indices = [[[0, 7, 60, 80, 95], [34, 57]], [[10, 11]]]
        per_member_values = [[[1.0, 2.0, 3.0, 5.0, 6.6], [1.0, 2.0]], [[-3.5, 2.3]]]
    Tensorflow representation of the batch above:
        SparseTensorValue(indices=array(
        [[0, 0, 0],
        [0, 0, 1],
        [0, 0, 2],
        [0, 0, 3],
        [0, 0, 4],
        [0, 1, 0],
        [0, 1, 1],
        [1, 0, 0],
        [1, 0, 1]]), values=array([ 1. ,  2. ,  3. ,  5. ,  6.6,  1. ,  2. , -3.5,  2.3],
        dtype=float32), dense_shape=array([2, 2, 5]))
        Note the first dimension is the batch dimension.
    """
    logger.info(
        f"Kicking off job producer with enable_local_indexing = {enable_local_indexing}."
    )
    for features_val, labels_val in dataset_reader(batch_iterator()):
        entity_column = features_val[model_params.partition_entity]
        num_entities = entity_column.shape[0]

        # Cursors into the flattened per-batch arrays: one over feature
        # entries, one over samples. Both advance after every entity.
        feature_cursor = 0
        sample_cursor = 0
        for entity in range(num_entities):
            # Samples of this entity according to the uid column.
            uid_indices = features_val[schema_params.uid_column_name].indices
            uid_hits = np.where(uid_indices[:, 0] == entity)
            rows = uid_indices[uid_hits][:, 1]
            uid_sample_count = rows.size

            if model_params.feature_bag is not None:
                # Slice this entity's feature entries out of the flattened
                # index/value arrays, starting at the feature cursor.
                bag_indices = features_val[model_params.feature_bag +
                                           INDICES_SUFFIX]
                coords = bag_indices.indices
                rows = coords[np.where(coords[:, 0] == entity)][:, 1]
                feature_stop = feature_cursor + len(rows)
                cols = bag_indices.values[feature_cursor:feature_stop]
                values = features_val[
                    model_params.feature_bag +
                    VALUES_SUFFIX].values[feature_cursor:feature_stop]

                sample_count = np.amax(rows) + 1

                # The feature bag and the uid column must agree on the
                # number of samples.
                assert sample_count == uid_sample_count
            else:
                # Intercept-only model: a single all-zero feature column.
                assert num_features == 1
                sample_count = uid_sample_count
                values = np.zeros(sample_count)
                cols = np.zeros(sample_count, dtype=int)

            # Entity id as text; it is also the key into model_weights.
            raw_entity_id = entity_column[entity]
            if isinstance(raw_entity_id, bytes):
                entity_id = raw_entity_id.decode('utf-8')
            else:
                entity_id = str(raw_entity_id)
            result = model_weights.get(entity_id)

            # Map between the global column ids and a dense local index.
            unique_global_indices, locally_indexed_cols = np.unique(
                cols, return_inverse=True)

            if not enable_local_indexing:
                # Data matrix expressed with global column indices.
                X = coo_matrix((values, (rows, cols)),
                               shape=(sample_count, num_features))
            else:
                # Data matrix expressed with per-entity local column indices.
                X = coo_matrix((values, (rows, locally_indexed_cols)))

            # Per-sample arrays: take sample_count items from the sample cursor.
            sample_stop = sample_cursor + sample_count
            y = labels_val[
                schema_params.label_column_name].values[sample_cursor:sample_stop]
            offsets = features_val[
                model_params.offset_column_name].values[sample_cursor:sample_stop]
            if schema_params.weight_column_name in features_val:
                weights = features_val[
                    schema_params.weight_column_name].values[sample_cursor:sample_stop]
            else:
                weights = np.ones(sample_count)
            ids = features_val[
                schema_params.uid_column_name].values[sample_cursor:sample_stop]

            # Warm start from a prior model when one exists. The prior model
            # may know fewer or more features than the current data.
            theta = None
            if result:
                if has_intercept:
                    # The intercept occupies row 0 and shifts every feature
                    # coefficient down by one.
                    coeffs_without_intercept = result.theta[1:]
                    idx_offset = 1
                    model_rows = [0]
                    model_values = [result.theta[0]]
                else:
                    coeffs_without_intercept = result.theta
                    idx_offset = 0
                    model_rows = []
                    model_values = []
                prior_model = dict(
                    zip(result.unique_global_indices, coeffs_without_intercept))

                for local_idx, global_idx in enumerate(unique_global_indices):
                    if global_idx not in prior_model:
                        continue
                    position = local_idx if enable_local_indexing else global_idx
                    model_rows.append(idx_offset + position)
                    model_values.append(prior_model[global_idx])
                model_cols = [0] * len(model_rows)

                if enable_local_indexing:
                    feature_dim = len(unique_global_indices)
                else:
                    feature_dim = num_features
                # One extra row when the intercept is part of the model.
                coeffs_length = feature_dim + idx_offset
                theta = csr_matrix((model_values, (model_rows, model_cols)),
                                   shape=(coeffs_length, 1))

            job_queue.put(
                Job(entity_id,
                    X,
                    y,
                    offsets,
                    weights,
                    ids,
                    unique_global_indices,
                    theta=theta))
            # entity_id serves as a token; it may not be unique.
            yield entity_id

            # Advance both cursors past this entity.
            sample_cursor += sample_count
            feature_cursor += len(rows)
Exemplo n.º 3
0
    def _run_command(self,
                     cmd: list[str],
                     execution_time: list[float],
                     gpu_queue: BaseProxy = None) -> bool:
        """Run an encoding or decoding command and time it.

        The command runs with ``self._algs_cfg['rootdir']`` as working
        directory. On success the elapsed wall-clock seconds are stored
        in ``execution_time[0]``. On a non-zero exit status the captured
        stdout and stderr are written to a timestamped file in
        ``logs/`` and an error is logged; then, based on the `debug`
        flag, the ``CalledProcessError`` is re-raised or the failure
        counter is incremented silently.

        Parameters
        ----------
        cmd : `list[str]`
            Command to execute, in the same format ``subprocess`` uses.
        execution_time : `list[float]`
            A non-empty list; index 0 receives the execution time.
        gpu_queue : `BaseProxy`, optional
            A multiprocessing Manager.Queue() object holding GPU device
            IDs. When given, one ID is taken from the queue, exposed to
            the command through ``CUDA_VISIBLE_DEVICES`` and put back
            once the command has finished. Must be assigned if running
            a PCC algorithm using GPUs. Defaults to None.

        Returns
        -------
        `bool`
            True if successfully executed, False otherwise.

        Raises
        ------
        `subprocess.CalledProcessError`
            If the command fails and ``self._debug`` is True.
        """
        if gpu_queue is not None:
            gpu_id = gpu_queue.get()
            # Inject environment variable `CUDA_VISIBLE_DEVICES` to ``cmd``.
            env = dict(os.environ, CUDA_VISIBLE_DEVICES=str(gpu_id))
        else:
            env = os.environ

        try:
            start_time = time.time()
            sp.run(cmd,
                   cwd=self._algs_cfg['rootdir'],
                   env=env,
                   capture_output=True,
                   check=True)
            end_time = time.time()
        except sp.CalledProcessError as e:
            cmd_str = ''.join(str(s) + ' ' for s in cmd)
            timestamp = datetime.datetime.now().strftime('%Y%m%d_%H:%M:%S')
            log_file = (Path(__file__).parents[1].joinpath(
                f'logs/execute_cmd_{timestamp}.log'))
            # Make sure the log directory exists; otherwise ``open`` would
            # raise here and hide the command failure being reported.
            log_file.parent.mkdir(parents=True, exist_ok=True)

            # errors='replace' keeps undecodable output from raising inside
            # this handler.
            lines = [
                "The stdout and stderr of executed command: ",
                cmd_str,
                "\n",
                "===== stdout =====",
                e.stdout.decode('utf-8', errors='replace'),
                "\n",
                "===== stderr =====",
                e.stderr.decode('utf-8', errors='replace'),
            ]
            with open(log_file, 'w') as f:
                f.write('\n'.join(lines))

            # One format string with lazy arguments: extra positional strings
            # without matching placeholders make logging drop the record.
            logger.error(
                "Error occurs when executing command: %s\n"
                "Check %s for more informations.", cmd_str, log_file)

            if self._debug is True:
                raise
            self._failure_cnt += 1
            return False
        else:
            execution_time[0] = end_time - start_time
            return True
        finally:
            # Hand the GPU id back whatever the outcome was.
            if gpu_queue is not None:
                gpu_queue.put(gpu_id)