Example #1
    async def train(self, sources: Sources) -> None:
        """
        Trains and saves a model using the source data and the config attributes.
        """
        # Get data into memory
        xdata = []
        ydata = []
        async for record in sources.with_features(
                self.features + [self.parent.config.predict.name]):
            record_data = []
            for feature in record.features(self.features).values():
                record_data.extend(
                    [feature] if np.isscalar(feature) else feature)
            xdata.append(record_data)
            ydata.append(record.feature(self.parent.config.predict.name))
        x_data = pd.DataFrame(xdata)
        y_data = pd.DataFrame(ydata)

        self.saved = XGBClassifier(
            n_estimators=self.config.n_estimators,
            learning_rate=self.config.learning_rate,
            max_depth=self.config.max_depth,
            objective=self.config.objective,
            subsample=self.config.subsample,
            gamma=self.config.gamma,
            n_jobs=self.config.n_jobs,
            colsample_bytree=self.config.colsample_bytree,
            booster=self.config.booster,
            min_child_weight=self.config.min_child_weight,
            reg_lambda=self.config.reg_lambda,
            reg_alpha=self.config.reg_alpha,
        )

        self.saved.fit(x_data, y_data, eval_metric="merror")
        self.is_trained = True
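
A note on the row-building idiom above: `[feature] if np.isscalar(feature) else feature` lets `extend` handle scalar and list-valued features through one code path. A minimal standalone sketch (the record dict here is hypothetical):

    import numpy as np

    record_features = {"age": 42, "embedding": [0.1, 0.2, 0.3]}  # hypothetical mix
    row = []
    for value in record_features.values():
        # Wrap scalars in a one-element list so extend() flattens both cases
        row.extend([value] if np.isscalar(value) else value)
    print(row)  # [42, 0.1, 0.2, 0.3]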
Example #2
 async def get_input_data(self, sources: Sources) -> list:
     saved_records = []
     async for record in sources.with_features(
         self.config.features.names()
     ):
         saved_records.append(record)
     return saved_records
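
`with_features` is an async generator, so collecting its records requires an event loop. A self-contained sketch of the same pattern, with a stub generator standing in for `sources.with_features`:

    import asyncio

    async def with_features_stub(names):
        # Stand-in for sources.with_features(): yields records asynchronously
        for record in ({"f1": 1}, {"f1": 2}):
            yield record

    async def get_input_data():
        saved_records = []
        async for record in with_features_stub(["f1"]):
            saved_records.append(record)
        return saved_records

    print(asyncio.run(get_input_data()))  # [{'f1': 1}, {'f1': 2}]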
Example #3
 async def accuracy_input_fn(self, sources: Sources, **kwargs):
     """
     Uses the numpy input function with data from repo features.
     """
     x_cols: Dict[str, Any] = {feature: [] for feature in self.features}
     y_cols = []
     for repo in [
             repo async for repo in sources.with_features(
                 self.features + [self.parent.config.predict.NAME])
             if repo.feature(self.parent.config.predict.NAME) in
             self.classifications
     ]:
         for feature, results in repo.features(self.features).items():
             x_cols[feature].append(np.array(results))
         y_cols.append(self.classifications[repo.feature(
             self.parent.config.predict.NAME)])
     y_cols = np.array(y_cols)
     for feature in x_cols:
         x_cols[feature] = np.array(x_cols[feature])
     self.logger.info("------ Repo Data ------")
     self.logger.info("x_cols:    %d", len(list(x_cols.values())[0]))
     self.logger.info("y_cols:    %d", len(y_cols))
     self.logger.info("-----------------------")
     input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
         x_cols,
         y_cols,
         batch_size=self.parent.config.batchsize,
         shuffle=self.parent.config.shuffle,
         num_epochs=1,
         **kwargs,
     )
     return input_fn
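
The bracketed `[repo async for repo in ... if ...]` above is an asynchronous list comprehension (Python 3.6+): it drains the async generator while dropping records whose label is not among the known classifications. The same filter in isolation, with a stub generator:

    import asyncio

    async def records_stub():
        for label in ("cat", "dog", "bird"):
            yield {"label": label}

    async def main():
        known = {"cat": 0, "dog": 1}
        # Keep only records whose label has a classification index
        kept = [r async for r in records_stub() if r["label"] in known]
        print(kept)  # [{'label': 'cat'}, {'label': 'dog'}]

    asyncio.run(main())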
Example #4
 async def train(self, sources: Sources):
     xdata = []
     ydata = []
     # np.hstack helps flatten the lists without splitting strings.
     async for record in sources.with_features(
         list(self.np.hstack(self.features + [self.predictions]))):
         feature_data = []
         predict_data = []
         for feature in record.features(self.features).values():
             feature_data.extend(
                 [feature] if self.np.isscalar(feature) else feature)
         xdata.append(feature_data)
         if self.is_multi:
             for feature in record.features(self.predictions).values():
                 predict_data.extend(
                     [feature] if self.np.isscalar(feature) else feature)
         else:
             predict_data = record.feature(self.predictions)
         ydata.append(predict_data)
     xdata = self.np.array(xdata)
     ydata = self.np.array(ydata)
     self.logger.info("Number of input records: {}".format(len(xdata)))
     if (self.is_multi
             and "MultiOutput" not in self.parent.clf.__class__.__name__):
         if self.estimator_type == "regressor":
             self.parent.clf = MultiOutputRegressor(self.parent.clf)
         elif self.estimator_type == "classifier":
             self.parent.clf = MultiOutputClassifier(self.parent.clf)
         else:
             raise NoMultiOutputSupport(
                 "Model does not support multi-output. Please refer the docs to find a suitable model entrypoint."
             )
     self.parent.clf.fit(xdata, ydata)
     self.is_trained = True
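
As the comment in this example notes, `np.hstack` flattens one level of nesting while leaving whole strings intact, which a character-level flatten would split apart:

    import numpy as np

    features = ["color", ["height", "width"]]  # mix of scalar and list entries
    print(np.hstack(features).tolist())  # ['color', 'height', 'width']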
Example #5
    async def evaluate_input_fn(
        self,
        sources: Sources,
        batch_size=20,
        shuffle=False,
        epochs=1,
        **kwargs,
    ):
        """
        Uses the numpy input function with data from repo features.
        """
        x_cols: Dict[str, Any] = {feature: [] for feature in self.features}
        y_cols = []

        async for repo in sources.with_features(self.all_features):
            for feature, results in repo.features(self.features).items():
                x_cols[feature].append(np.array(results))
            y_cols.append(repo.feature(self.parent.config.predict))

        y_cols = np.array(y_cols)
        for feature in x_cols:
            x_cols[feature] = np.array(x_cols[feature])
        self.logger.info("------ Repo Data ------")
        self.logger.info("x_cols:    %d", len(list(x_cols.values())[0]))
        self.logger.info("y_cols:    %d", len(y_cols))
        self.logger.info("-----------------------")
        input_fn = tensorflow.estimator.inputs.numpy_input_fn(
            x_cols,
            y_cols,
            batch_size=batch_size,
            shuffle=shuffle,
            num_epochs=epochs,
            **kwargs,
        )
        return input_fn
Example #6
    async def sources_to_array(self, sources: Sources):
        x_cols: Dict[str, Any] = {feature: [] for feature in self.features}
        y_cols = []
        for record in [
            record
            async for record in sources.with_features(
                self.features + [self.parent.config.predict.name]
            )
            if self.parent.config.clstype(
                record.feature(self.parent.config.predict.name)
            )
            in self.classifications
        ]:
            for feature, results in record.features(self.features).items():
                x_cols[feature].append(self.np.array(results))
            y_cols.append(
                self.classifications[
                    self.parent.config.clstype(
                        record.feature(self.parent.config.predict.name)
                    )
                ]
            )
        if not y_cols:
            raise ValueError("No records to train on")
        y_cols = self.np.array(y_cols)
        for feature in x_cols:
            x_cols[feature] = self.np.array(x_cols[feature])

        return x_cols, y_cols
Example #7
 async def train(self, sources: Sources):
     xdata = []
     async for record in sources.with_features(self.features):
         feature_data = record.features(self.features)
         xdata.append(list(feature_data.values()))
     xdata = self.np.array(xdata)
     self.logger.info("Number of input records: {}".format(len(xdata)))
     self.parent.clf.fit(xdata)
     self.is_trained = True
Example #8
 async def train(self, sources: Sources):
     async for repo in sources.with_features(
             self.features + [self.parent.config.predict.NAME]):
         feature_data = repo.features(self.features +
                                      [self.parent.config.predict.NAME])
         self.xData = np.append(self.xData, feature_data[self.features[0]])
         self.yData = np.append(
             self.yData, feature_data[self.parent.config.predict.NAME])
     self.regression_line = await self.best_fit_line()
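
`best_fit_line` is not shown in this snippet; for a single feature it would typically be an ordinary least-squares fit of y = m*x + b. A minimal sketch of that calculation (a hypothetical stand-in, not the original helper):

    import numpy as np

    def best_fit_line(x_data, y_data):
        # Degree-1 least-squares polynomial fit: returns slope and intercept
        m, b = np.polyfit(x_data, y_data, 1)
        return m, b

    m, b = best_fit_line(np.array([1.0, 2.0, 3.0]), np.array([2.0, 4.0, 6.1]))
    print(round(m, 2), round(b, 2))  # 2.05 -0.07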
Example #9
 async def train(self, sources: Sources):
     xdata = []
     async for record in sources.with_features(self.features):
         feature_data = record.features(self.features)
         xdata.append(list(feature_data.values()))
     xdata = self.np.array(xdata)
     self.logger.info("Number of input records: {}".format(len(xdata)))
     self.clf.fit(xdata)
     self.joblib.dump(self.clf, str(self._filepath))
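
Several of these examples persist the fitted estimator with `joblib.dump` and would later reload it with `joblib.load`. A minimal round-trip sketch using scikit-learn:

    import joblib
    from sklearn.cluster import KMeans

    clf = KMeans(n_clusters=2, n_init=10).fit([[0.0], [0.1], [10.0], [10.1]])
    joblib.dump(clf, "model.joblib")         # serialize the fitted estimator
    restored = joblib.load("model.joblib")   # restore it in a later process
    print(restored.predict([[0.05]]) == clf.predict([[0.05]]))  # [ True]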
Example #10
    async def accuracy(self, sources: Sources) -> Accuracy:
        if not os.path.isfile(self._filename()):
            raise ModelNotTrained("Train model before assessing for accuracy.")
        data = []
        importance, tag, base, class_cost = None, None, None, None
        if self.parent.config.importance:
            importance = self.parent.config.importance.name

        if self.parent.config.tag:
            tag = self.parent.config.tag.name

        if self.parent.config.base:
            base = self.parent.config.base.name
        async for record in sources.with_features(self.features):
            feature_data = record.features(
                self.features
                + [self.parent.config.predict.name]
                + self.parent.config.extra_cols
            )
            data.append(feature_data)
        df = pd.DataFrame(data)
        xdata = df.drop([self.parent.config.predict.name], axis=1)
        self.logger.debug("Number of input records: {}".format(len(xdata)))
        if not self.parent.config.noconvert:
            xdata = df_to_vw_format(
                xdata,
                vwcmd=self.parent.config.vwcmd,
                target=None,
                namespace=self.parent.config.namespace,
                importance=importance,
                tag=tag,
                base=base,
                task=self.parent.config.task,
                use_binary_label=self.parent.config.use_binary_label,
            )
        else:
            xdata = (
                xdata.drop(self.parent.config.extra_cols, axis=1)
                .to_numpy()
                .flatten()
            )
        ydata = np.array(df[self.parent.config.predict.name])
        shape = [len(xdata)]
        # TODO support probabilities
        # if 'oaa' in self.parent.config.vwcmd and 'probabilities' in self.parent.config.vwcmd:
        #     shape.append(self.parent.config.vwcmd['oaa'])
        y_pred = np.empty(shape)
        for idx, x in enumerate(xdata):
            y_pred[idx] = self.clf.predict(x)

        if self.parent.config.task in ["regression"]:
            self.confidence = r2_score(ydata, y_pred)
        elif self.parent.config.task in ["classification"]:
            self.confidence = accuracy_score(ydata, y_pred)
        self.logger.debug("Model Accuracy: {}".format(self.confidence))
        return self.confidence
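
The metric is chosen per task: `r2_score` (coefficient of determination) for regression and `accuracy_score` (fraction of exact label matches) for classification, both from scikit-learn:

    from sklearn.metrics import accuracy_score, r2_score

    print(r2_score([3.0, 5.0, 7.0], [2.9, 5.1, 7.2]))  # ~0.99, near-perfect fit
    print(accuracy_score([0, 1, 1, 0], [0, 1, 0, 0]))  # 0.75, 3 of 4 labels match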
Example #11
 async def train(self, sources: Sources):
     data = []
     async for repo in sources.with_features(self.features):
         feature_data = repo.features(self.features)
         data.append(feature_data)
     df = pd.DataFrame(data)
     xdata = np.array(df)
     self.logger.info("Number of input repos: {}".format(len(xdata)))
     self.clf.fit(xdata)
     joblib.dump(self.clf, self._filename())
Example #12
 async def train(self, sources: Sources):
     data = []
     async for record in sources.with_features(self.features):
         feature_data = record.features(self.features)
         data.append(feature_data)
     df = self.pd.DataFrame(data)
     xdata = self.np.array(df)
     self.logger.info("Number of input records: {}".format(len(xdata)))
     self.clf.fit(xdata)
     self.joblib.dump(self.clf, str(self._filepath))
Example #13
    async def train_data_generator(self, sources: Sources):

        self.logger.debug("Training on features: %r", self.parent.features)
        x_cols: Dict[str, Any] = {
            feature: [] for feature in self.parent.features
        }
        y_cols = []
        all_records = []
        all_sources = sources.with_features(
            self.parent.features + [self.classification]
        )
        async for record in all_sources:
            if record.feature(self.classification) in self.classifications:
                all_records.append(record)
        for record in all_records:
            for feature, results in record.features(
                self.parent.features
            ).items():
                x_cols[feature].append(self.np.array(results))
            y_cols.append(
                self.classifications[record.feature(self.classification)]
            )
        if not y_cols:
            raise ValueError("No records to train on")
        y_cols = self.np.array(y_cols)
        for feature in x_cols:
            x_cols[feature] = self.np.array(x_cols[feature])
        self.logger.info("------ Record Data ------")
        self.logger.info("x_cols:    %d", len(list(x_cols.values())[0]))
        self.logger.info("y_cols:    %d", len(y_cols))
        self.logger.info("-----------------------")

        if (len(self.parent.features)) > 1:
            self.logger.critical(
                "Found more than one feature to train on. Only first feature will be used"
            )
        # TODO add more embedTypes
        # so far only model available on tensorflow hub which requires special input preprocessing is `bert`
        if self.parent.config.embedType in ["bert"]:
            x_cols = bert_tokenizer(
                x_cols[self.parent.features[0]],
                self.parent.config.max_seq_length,
                self.parent._model.vocab_file.asset_path.numpy(),
                self.parent._model.do_lower_case.numpy(),
            )
            x_cols = dict(
                input_word_ids=x_cols[0],
                input_mask=x_cols[1],
                segment_ids=x_cols[2],
            )
        else:
            # Universal Sentence Encoder, Neural Network Language Model, Swivel Embeddings
            # No preprocessing needed
            x_cols = x_cols[self.parent.features[0]]
        return x_cols, y_cols
Example #14
    async def train(self, sources: Sources):
        data = []
        importance, tag, base, class_cost = None, None, None, None
        if self.parent.config.importance:
            importance = self.parent.config.importance.name

        if self.parent.config.tag:
            tag = self.parent.config.tag.name

        if self.parent.config.base:
            base = self.parent.config.base.name
        if self.parent.config.class_cost:
            class_cost = [
                feature.name for feature in self.parent.config.class_cost
            ]
        async for record in sources.with_features(
                self.parent.features + [self.parent.config.predict.name] +
                self.parent.config.extra_cols):
            feature_data = record.features(self.parent.features +
                                           [self.parent.config.predict.name] +
                                           self.parent.config.extra_cols)
            data.append(feature_data)
        vw_data = pd.DataFrame(data)
        if not self.parent.config.noconvert:
            vw_data = df_to_vw_format(
                vw_data,
                vwcmd=self.parent.config.vwcmd,
                target=self.parent.config.predict.name,
                namespace=self.parent.config.namespace,
                importance=importance,
                tag=tag,
                base=base,
                task=self.parent.config.task,
                use_binary_label=self.parent.config.use_binary_label,
                class_cost=class_cost,
            )
        # support data already in vw format
        # append `predict` to `features`
        else:
            if len(self.parent.features) > 1:
                raise InputError(
                    "Training features should be in vw format or `noconvert` should be false."
                )
            vw_data = (vw_data[self.parent.config.predict.name].map(str) +
                       " " + vw_data[self.parent.features[0]].map(str))
        self.logger.info("Number of input records: {}".format(len(vw_data)))
        for n in range(self.parent.config.passes):
            if n > 1:
                X = shuffle(vw_data)
            else:
                X = vw_data
            for x in X:
                self.parent.clf.learn(x)
        self.is_trained = True
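
`df_to_vw_format` is a project helper, but its target format is Vowpal Wabbit's standard text input, roughly `label [tag]| feature:value ...`. A minimal sketch of emitting one such line from a row dict (this helper is hypothetical, not the original):

    def to_vw_line(label, features, tag=None):
        # Vowpal Wabbit text format: "label [tag]| f1:v1 f2:v2 ..."
        head = f"{label} {tag}" if tag is not None else f"{label}"
        body = " ".join(f"{name}:{value}" for name, value in features.items())
        return f"{head}| {body}"

    print(to_vw_line(1, {"height": 0.5, "width": 0.25}))  # 1| height:0.5 width:0.25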
Example #15
 async def train(self, sources: Sources):
     data = []
     async for repo in sources.with_features(self.features):
         feature_data = repo.features(self.features +
                                      [self.parent.config.predict])
         data.append(feature_data)
     df = pd.DataFrame(data)
     xdata = np.array(df.drop([self.parent.config.predict], axis=1))
     ydata = np.array(df[self.parent.config.predict])
     self.logger.info("Number of input repos: {}".format(len(xdata)))
     self.clf.fit(xdata, ydata)
     joblib.dump(self.clf, self._filename())
Example #16
    async def score(
        self, mctx: ModelContext, sctx: Sources, features: Feature
    ):
        # Load saved anomalies
        anomalies = mctx.storage.get("anomalies", None)
        # Ensure the model has been trained before we try to make a prediction
        if not mctx.is_trained:
            raise ModelNotTrained("Train model before assessing for accuracy.")

        epsilon, _F1val, mu, sigma2 = anomalies

        X = []
        Y = []
        # Go through all records that have the feature we're training on and the
        # feature we want to predict.
        async for record in sctx.with_features(
            mctx.features + [features.name]
        ):
            record_data = []
            for feature in record.features(mctx.features).values():
                record_data.extend(
                    [feature] if np.isscalar(feature) else feature
                )

            X.append(record_data)
            Y.append(record.feature(features.name))

        mctx.logger.debug("Number of test records: %d", len(X))

        # Number of features
        nof = len(mctx.features)

        X = np.reshape(X, (len(X), nof))

        Y = np.reshape(Y, (len(Y), 1))

        mu = np.array(mu)
        sigma2 = np.array(sigma2)
        p = multivariateGaussian(X, mu, sigma2)

        pred = (p < epsilon).astype(int)

        F1 = getF1(Y, pred)

        outliers = p < epsilon

        listOfOl = findIndices(outliers)

        accuracy = F1
        # Update the accuracy
        mctx.storage["anomalies"] = epsilon, F1, mu.tolist(), sigma2.tolist()
        return accuracy
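
The `multivariateGaussian`, `getF1`, and `findIndices` helpers are not shown. Given how this scorer flags points whose density falls below `epsilon`, `multivariateGaussian` is conventionally the Gaussian density with a diagonal covariance built from the per-feature variances `sigma2`. A sketch under that assumption:

    import numpy as np

    def multivariate_gaussian(X, mu, sigma2):
        # Density of N(mu, diag(sigma2)) evaluated at each row of X
        k = len(mu)
        diff = X - mu
        exponent = -0.5 * np.sum((diff ** 2) / sigma2, axis=1)
        norm = (2 * np.pi) ** (-k / 2) * np.prod(sigma2) ** -0.5
        return norm * np.exp(exponent)

    X = np.array([[0.0, 0.0], [5.0, 5.0]])
    p = multivariate_gaussian(X, np.array([0.0, 0.0]), np.array([1.0, 1.0]))
    print(p < 0.01)  # [False  True]: only the distant point is flagged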
Example #17
 async def accuracy(self, sources: Sources) -> Accuracy:
     data = []
     async for repo in sources.with_features(self.features):
         feature_data = repo.features(self.features +
                                      [self.parent.config.predict])
         data.append(feature_data)
     df = pd.DataFrame(data)
     xdata = np.array(df.drop([self.parent.config.predict], axis=1))
     ydata = np.array(df[self.parent.config.predict])
     self.logger.debug("Number of input repos: {}".format(len(xdata)))
     self.confidence = self.clf.score(xdata, ydata)
     self.logger.debug("Model Accuracy: {}".format(self.confidence))
     return self.confidence
Example #18
 async def _preprocess_data(self, sources: Sources):
     all_examples = []
     all_sources = sources.with_features([
         "sentence",
         "entities",
     ])
     async for record in all_sources:
         all_examples.append((
             record.feature("sentence"),
             {
                 "entities": record.feature("entities")
             },
         ))
     return all_examples
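
The `(sentence, {"entities": ...})` tuples follow spaCy's classic NER training layout, where each entity is a `(start_char, end_char, label)` span over the sentence. A hypothetical record showing the expected shape:

    example = (
        "Uber raised $40M",
        {"entities": [(0, 4, "ORG"), (12, 16, "MONEY")]},  # (start, end, label)
    )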
Example #19
    async def dataset_generator(self, sources: Sources):
        """
        Get data from source and convert into Tensor format for further processing
        """
        self.logger.debug("Training on features: %r", self.features)
        x_cols: Dict[str, Any] = {feature: [] for feature in self.features}
        y_cols = []
        all_records = []
        all_sources = sources.with_features(
            self.features + [self.parent.config.predict.name]
        )

        async for record in all_sources:
            for feature, results in record.features(self.features).items():
                x_cols[feature].append(np.array(results))
            y_cols.append(
                self.classifications[
                    record.feature(self.parent.config.predict.name)
                ]
                if self.classifications
                else record.feature(self.parent.config.predict.name)
            )
        if (len(self.features)) > 1:
            self.logger.critical(
                "Found more than one feature to train on. Only first feature will be used"
            )
        if not y_cols:
            raise ValueError("No records to train on")

        y_cols = np.array(y_cols)
        for feature in x_cols:
            x_cols[feature] = np.array(x_cols[feature])

        self.logger.info("------ Record Data ------")
        self.logger.info("x_cols:    %d", len(list(x_cols.values())[0]))
        self.logger.info("y_cols:    %d", len(y_cols))
        self.logger.info("-----------------------")

        x_cols = x_cols[self.features[0]]
        # Convert x and y data to tensors and normalize them accordingly
        dataset = NumpyToTensor(
            x_cols,
            y_cols,
            size=self.parent.config.imageSize,
            norm_mean=self.parent.config.normalize_mean,
            norm_std=self.parent.config.normalize_std,
        )

        return dataset, len(dataset)
Example #20
 async def train(self, sources: Sources):
     data = []
     async for record in sources.with_features(
         self.features + [self.parent.config.predict.NAME]
     ):
         feature_data = record.features(
             self.features + [self.parent.config.predict.NAME]
         )
         data.append(feature_data)
     df = self.pd.DataFrame(data)
     xdata = self.np.array(df.drop([self.parent.config.predict.NAME], axis=1))
     ydata = self.np.array(df[self.parent.config.predict.NAME])
     self.logger.info("Number of input records: {}".format(len(xdata)))
     self.clf.fit(xdata, ydata)
     self.joblib.dump(self.clf, str(self._filepath))
Example #21
 async def train(self, sources: Sources):
     data = []
     async for repo in sources.with_features(
         self.features + [self.parent.config.predict.NAME]
     ):
         feature_data = repo.features(
             self.features + [self.parent.config.predict.NAME]
         )
         slice_ = [feature_data[data] for data in self.features]
         data.append(slice_)
         self.yData = np.append(
             self.yData, feature_data[self.parent.config.predict.NAME]
         )
     self.xData = np.asarray(data, dtype=float).reshape(-1, len(self.features))
     await self.best_fit_line()
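
The closing `np.asarray(...).reshape(-1, len(self.features))` turns the list of per-record slices into a 2-D design matrix with one row per record:

    import numpy as np

    slices = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]  # one slice per record
    x = np.asarray(slices, dtype=float).reshape(-1, 2)
    print(x.shape)  # (3, 2): three records, two features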
Example #22
    async def sources_to_array(self, sources: Sources):
        x_cols: Dict[str, Any] = {feature: [] for feature in self.features}
        y_cols = []

        async for record in sources.with_features(self.all_features):
            for feature, results in record.features(self.features).items():
                x_cols[feature].append(self.np.array(results))
            y_cols.append(record.feature(self.parent.config.predict.name))

        y_cols = self.np.array(y_cols)
        for feature in x_cols:
            x_cols[feature] = self.np.array(x_cols[feature])

        return x_cols, y_cols
Example #23
 async def accuracy(self, sources: Sources) -> Accuracy:
     if not self._filepath.is_file():
         raise ModelNotTrained("Train model before assessing for accuracy.")
     data = []
     async for record in sources.with_features(self.features):
         feature_data = record.features(self.features +
                                        [self.parent.config.predict.name])
         data.append(feature_data)
     df = self.pd.DataFrame(data)
     xdata = self.np.array(df.drop([self.parent.config.predict.name], axis=1))
     ydata = self.np.array(df[self.parent.config.predict.name])
     self.logger.debug("Number of input records: {}".format(len(xdata)))
     self.confidence = self.clf.score(xdata, ydata)
     self.logger.debug("Model Accuracy: {}".format(self.confidence))
     return self.confidence
Example #24
 async def accuracy(self, sources: Sources) -> Accuracy:
     if not self._filepath.is_file():
         raise ModelNotTrained("Train model before assessing for accuracy.")
     xdata = []
     ydata = []
     target = []
     estimator_type = self.clf._estimator_type
     if estimator_type == "clusterer":
         target = (
             []
             if self.parent.config.tcluster is None
             else [self.parent.config.tcluster.name]
         )
     async for record in sources.with_features(self.features):
         feature_data = record.features(self.features)
         xdata.append(list(feature_data.values()))
         ydata.append(list(record.features(target).values()))
     xdata = self.np.array(xdata)
     self.logger.debug("Number of input records: {}".format(len(xdata)))
     if target:
         ydata = self.np.array(ydata).flatten()
         if hasattr(self.clf, "predict"):
             # xdata can be training data or unseen data
             # inductive clusterer with ground truth
             y_pred = self.clf.predict(xdata)
             self.confidence = mutual_info_score(ydata, y_pred)
         else:
             # requires xdata = training data
             # transductive clusterer with ground truth
             self.logger.critical(
                 "Accuracy found transductive clusterer, ensure data being passed is training data"
             )
             self.confidence = mutual_info_score(ydata, self.clf.labels_)
     else:
         if hasattr(self.clf, "predict"):
             # xdata can be training data or unseen data
             # inductive clusterer without ground truth
             y_pred = self.clf.predict(xdata)
             self.confidence = silhouette_score(xdata, y_pred)
         else:
             # requires xdata = training data
             # transductive clusterer without ground truth
             self.logger.critical(
                 "Accuracy found transductive clusterer, ensure data being passed is training data"
             )
             self.confidence = silhouette_score(xdata, self.clf.labels_)
     self.logger.debug("Model Accuracy: {}".format(self.confidence))
     return self.confidence
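
The two branches map onto scikit-learn's standard clustering metrics: `mutual_info_score` compares cluster assignments against ground-truth labels, while `silhouette_score` rates cohesion and separation from the data alone:

    import numpy as np
    from sklearn.metrics import mutual_info_score, silhouette_score

    X = np.array([[0.0], [0.2], [10.0], [10.2]])
    labels = np.array([0, 0, 1, 1])
    truth = np.array([1, 1, 0, 0])
    print(mutual_info_score(truth, labels))  # ~0.69 (ln 2): perfect up to relabeling
    print(silhouette_score(X, labels))       # ~0.98: tight, well-separated clusters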
Example #25
 async def train(self, sources: Sources):
     xdata = []
     ydata = []
     async for record in sources.with_features(
             self.features + [self.parent.config.predict.name]):
         record_data = []
         for feature in record.features(self.features).values():
             record_data.extend(
                 [feature] if self.np.isscalar(feature) else feature)
         xdata.append(record_data)
         ydata.append(record.feature(self.parent.config.predict.name))
     xdata = self.np.array(xdata)
     ydata = self.np.array(ydata)
     self.logger.info("Number of input records: {}".format(len(xdata)))
     self.clf.fit(xdata, ydata)
     self.joblib.dump(self.clf, str(self._filepath))
Example #26
 async def training_input_fn(
     self,
     sources: Sources,
     batch_size=20,
     shuffle=False,
     epochs=1,
     **kwargs,
 ):
     """
     Uses the numpy input function with data from repo features.
     """
     self.logger.debug("Training on features: %r", self.features)
     x_cols: Dict[str, Any] = {feature: [] for feature in self.features}
     y_cols = []
     for repo in [
         repo
         async for repo in sources.with_features(
             self.features + [self.parent.config.predict.NAME]
         )
         if repo.feature(self.parent.config.predict.NAME)
         in self.classifications
     ]:
         for feature, results in repo.features(self.features).items():
             x_cols[feature].append(np.array(results))
         y_cols.append(
             self.classifications[
                 repo.feature(self.parent.config.predict.NAME)
             ]
         )
     if not y_cols:
         raise ValueError("No repos to train on")
     y_cols = np.array(y_cols)
     for feature in x_cols:
         x_cols[feature] = np.array(x_cols[feature])
     self.logger.info("------ Repo Data ------")
     self.logger.info("x_cols:    %d", len(list(x_cols.values())[0]))
     self.logger.info("y_cols:    %d", len(y_cols))
     self.logger.info("-----------------------")
     input_fn = tensorflow.estimator.inputs.numpy_input_fn(
         x_cols,
         y_cols,
         batch_size=batch_size,
         shuffle=shuffle,
         num_epochs=epochs,
         **kwargs,
     )
     return input_fn
Example #27
    async def train(self, sources: Sources):
        data = []
        importance, tag, base, class_cost = None, None, None, None
        if self.parent.config.importance:
            importance = self.parent.config.importance.name

        if self.parent.config.tag:
            tag = self.parent.config.tag.name

        if self.parent.config.base:
            base = self.parent.config.base.name
        if self.parent.config.class_cost:
            class_cost = [
                feature.name for feature in self.parent.config.class_cost
            ]
        async for record in sources.with_features(
                self.features + [self.parent.config.predict.name] +
                self.parent.config.extra_cols):
            feature_data = record.features(self.features +
                                           [self.parent.config.predict.name] +
                                           self.parent.config.extra_cols)
            data.append(feature_data)
        vw_data = pd.DataFrame(data)
        if self.parent.config.convert_to_vw:
            vw_data = df_to_vw_format(
                vw_data,
                vwcmd=self.parent.config.vwcmd,
                target=self.parent.config.predict.name,
                namespace=self.parent.config.namespace,
                importance=importance,
                tag=tag,
                base=base,
                task=self.parent.config.task,
                use_binary_label=self.parent.config.use_binary_label,
                class_cost=class_cost,
            )
        self.logger.info("Number of input records: {}".format(len(vw_data)))
        for n in range(self.parent.config.passes):
            if n > 1:
                X = shuffle(vw_data)
            else:
                X = vw_data
            for x in X:
                self.clf.learn(x)
        self._save_model()
Example #28
 async def accuracy(self, sources: Sources) -> Accuracy:
     if not os.path.isfile(self._filename()):
         raise ModelNotTrained("Train model before assessing for accuracy.")
     data = []
     target = []
     estimator_type = self.clf._estimator_type
     if estimator_type == "clusterer":
         target = ([] if self.parent.config.tcluster is None else
                   [self.parent.config.tcluster.NAME])
     async for repo in sources.with_features(self.features):
         feature_data = repo.features(self.features + target)
         data.append(feature_data)
     df = pd.DataFrame(data)
     xdata = np.array(df.drop(target, axis=1))
     self.logger.debug("Number of input repos: {}".format(len(xdata)))
     if target:
         ydata = np.array(df[target]).flatten()
         if hasattr(self.clf, "predict"):
             # xdata can be training data or unseen data
             # inductive clusterer with ground truth
             y_pred = self.clf.predict(xdata)
             self.confidence = mutual_info_score(ydata, y_pred)
         else:
             # requires xdata = training data
             # transductive clusterer with ground truth
             self.logger.critical(
                 "Accuracy found transductive clusterer, ensure data being passed is training data"
             )
             self.confidence = mutual_info_score(ydata, self.clf.labels_)
     else:
         if hasattr(self.clf, "predict"):
             # xdata can be training data or unseen data
             # inductive clusterer without ground truth
             y_pred = self.clf.predict(xdata)
             self.confidence = silhouette_score(xdata, y_pred)
         else:
             # requires xdata = training data
             # transductive clusterer without ground truth
             self.logger.critical(
                 "Accuracy found transductive clusterer, ensure data being passed is training data"
             )
             self.confidence = silhouette_score(xdata, self.clf.labels_)
     self.logger.debug("Model Accuracy: {}".format(self.confidence))
     return self.confidence
Example #29
 async def accuracy(self, sources: Sources) -> Accuracy:
     if not self._filepath.is_file():
         raise ModelNotTrained("Train model before assessing for accuracy.")
     xdata = []
     ydata = []
     async for record in sources.with_features(
             self.features + [self.parent.config.predict.name]):
         record_data = []
         for feature in record.features(self.features).values():
             record_data.extend(
                 [feature] if self.np.isscalar(feature) else feature)
         xdata.append(record_data)
         ydata.append(record.feature(self.parent.config.predict.name))
     xdata = self.np.array(xdata)
     ydata = self.np.array(ydata)
     self.logger.debug("Number of input records: {}".format(len(xdata)))
     self.confidence = self.clf.score(xdata, ydata)
     self.logger.debug("Model Accuracy: {}".format(self.confidence))
     return self.confidence
Example #30
    async def _preprocess_data(self, sources: Sources):
        x_cols: Dict[str, Any] = {
            feature: []
            for feature in (
                [self.parent.config.sid.name, self.parent.config.words.name]
            )
        }
        y_cols = []
        all_records = []
        all_sources = sources.with_features(
            [
                self.parent.config.sid.name,
                self.parent.config.words.name,
                self.parent.config.predict.name,
            ]
        )
        async for record in all_sources:
            if (
                record.feature(self.parent.config.predict.name)
                in self.parent.config.ner_tags
            ):
                all_records.append(record)
        for record in all_records:
            for feature, results in record.features(
                [self.parent.config.sid.name, self.parent.config.words.name]
            ).items():
                x_cols[feature].append(self.np.array(results))
            y_cols.append(record.feature(self.parent.config.predict.name))
        if not y_cols:
            raise ValueError("No records to train on")
        y_cols = self.np.array(y_cols)
        for feature in x_cols:
            x_cols[feature] = self.np.array(x_cols[feature])

        self.logger.info("------ Record Data ------")
        self.logger.info("x_cols:    %d", len(list(x_cols.values())[0]))
        self.logger.info("y_cols:    %d", len(y_cols))
        self.logger.info("-----------------------")
        df = self.pd.DataFrame.from_dict(x_cols)
        df[self.parent.config.predict.name] = y_cols
        return df