async def predict(
    self, sources: SourcesContext
) -> AsyncIterator[Tuple[Record, Any, float]]:
    """
    Uses the trained model to predict the target value for each record.
    """
    if not os.path.isfile(self.model_path):
        raise ModelNotTrained("Train model before prediction.")
    # Put the network in inference mode (disables dropout, batch norm updates)
    self._model.eval()
    async for record in sources.with_features(self.features):
        feature_data = record.features(self.features)[self.features[0]]
        predict = await self.prediction_data_generator(feature_data)
        target = self.parent.config.predict.name
        # Disable gradient calculation for prediction
        with torch.no_grad():
            for val in predict:
                val = val.to(self.device)
                output = self._model(val)
                if self.classifications:
                    # Softmax turns logits into probabilities; topk(1) gives
                    # the most probable class and its probability
                    prob = torch.nn.functional.softmax(output, dim=1)
                    confidence, prediction_value = prob.topk(1, dim=1)
                    record.predicted(
                        target,
                        self.cids[prediction_value.item()],
                        confidence,
                    )
                else:
                    confidence = 1.0 - self.criterion(val, output).item()
                    record.predicted(target, output, confidence)
        yield record
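
# Illustrative sketch, separate from the method above: how softmax + topk
# turn raw logits into a (confidence, class index) pair, mirroring the
# classification branch of predict(). The logits are made-up values.
import torch

logits = torch.tensor([[1.2, 0.3, 2.5]])  # one record, three classes
prob = torch.nn.functional.softmax(logits, dim=1)
confidence, prediction_value = prob.topk(1, dim=1)
print(prediction_value.item())  # 2 -> index used to look up self.cids
print(round(confidence.item(), 2))  # ~0.72 -> probability of that class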
async def predict(
    self, sources: SourcesContext
) -> AsyncIterator[Tuple[Record, Any, float]]:
    """
    Uses the trained model to predict the target value for each record.
    """
    if not os.path.isfile(
        os.path.join(self.model_dir_path, "saved_model.pb")
    ):
        raise ModelNotTrained("Train model before prediction.")
    async for record in sources.with_features(self.features):
        feature_data = record.features(self.features)
        df = self.pd.DataFrame(feature_data, index=[0])
        predict = await self.prediction_data_generator(
            self.np.array(df)[0]
        )
        all_prob = self._model.predict(predict)
        # Index of the highest-probability class for each row
        max_prob_idx = all_prob.argmax(axis=-1)
        target = self.parent.config.predict.name
        self.logger.debug(
            "Predicted probability of {} for {}: {}".format(
                target, self.np.array(df)[0], all_prob[0]
            )
        )
        record.predicted(
            target,
            self.cids[max_prob_idx[0]],
            all_prob[0][max_prob_idx[0]],
        )
        yield record
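
# Toy sketch of the argmax step above: given per-class probabilities for a
# batch, argmax(axis=-1) picks each row's most likely class. Values made up.
import numpy as np

all_prob = np.array([[0.1, 0.7, 0.2]])
max_prob_idx = all_prob.argmax(axis=-1)
print(max_prob_idx[0])  # 1 -> index into self.cids
print(all_prob[0][max_prob_idx[0]])  # 0.7 -> reported as the confidence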
async def predict(
    self, sources: SourcesContext
) -> AsyncIterator[Tuple[Record, Any, float]]:
    if not self._filepath.is_file():
        raise ModelNotTrained("Train model before prediction.")
    async for record in sources.with_features(self.features):
        # Flatten scalar and array features into a single row
        record_data = []
        for feature in record.features(self.features).values():
            record_data.extend(
                [feature] if self.np.isscalar(feature) else feature
            )
        predict = self.np.array([record_data])
        # Run the prediction once and reuse the result
        prediction = self.clf.predict(predict)
        self.logger.debug(
            "Predicted Value of {} for {}: {}".format(
                self.parent.config.predict, predict, prediction
            )
        )
        target = self.parent.config.predict.name
        record.predicted(
            target,
            self.parent.config.predict.dtype(prediction[0])
            if self.parent.config.predict.dtype is not str
            else prediction[0],
            self.confidence,
        )
        yield record
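
# Minimal sketch of the flattening loop above: scalar features are wrapped
# in a one-element list, array features are spread, so each record becomes
# one flat row suitable for clf.predict(). Feature names are invented.
import numpy as np

features = {"age": 42, "embedding": [0.1, 0.2, 0.3]}
record_data = []
for feature in features.values():
    record_data.extend([feature] if np.isscalar(feature) else feature)
print(np.array([record_data]))  # [[42.   0.1  0.2  0.3]] -- one 2D row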
async def predict(
    self, sources: SourcesContext
) -> AsyncIterator[Tuple[Record, Any, float]]:
    if not self._filepath.is_file():
        raise ModelNotTrained("Train model before prediction.")
    estimator_type = self.clf._estimator_type
    if estimator_type == "clusterer":
        if hasattr(self.clf, "predict"):
            # Inductive clusterer: can assign new data to learned clusters
            predictor = self.clf.predict
        else:
            # Transductive clusterer: only labels the data it was fit on
            self.logger.critical(
                "Predict found transductive clusterer, ensure data being passed is training data"
            )

            def yield_labels():
                # np.int was removed from NumPy; the builtin int is equivalent
                for label in self.clf.labels_.astype(int):
                    yield label

            labels = yield_labels()
            predictor = lambda predict: [next(labels)]
    async for record in sources.with_features(self.features):
        feature_data = record.features(self.features)
        predict = self.np.array([list(feature_data.values())])
        prediction = predictor(predict)
        self.logger.debug(
            "Predicted cluster for {}: {}".format(predict, prediction)
        )
        target = self.parent.config.predict.name
        record.predicted(target, prediction[0], self.confidence)
        yield record
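
# Hedged sketch of the inductive/transductive split above: KMeans exposes
# predict() for unseen data, while AgglomerativeClustering only records
# labels_ for the data it was fit on. The toy data is an assumption.
import numpy as np
from sklearn.cluster import AgglomerativeClustering, KMeans

X = np.array([[0.0], [0.1], [10.0], [10.1]])
inductive = KMeans(n_clusters=2, n_init=10).fit(X)
transductive = AgglomerativeClustering(n_clusters=2).fit(X)
print(hasattr(inductive, "predict"))  # True: can cluster new points
print(hasattr(transductive, "predict"))  # False: must reuse training data
print(transductive.labels_.astype(int))  # e.g. [0 0 1 1]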
async def predict(
    self, sources: SourcesContext
) -> AsyncIterator[Tuple[Record, Any, float]]:
    if not os.path.isfile(
        os.path.join(self.parent.config.output_dir, "tf_model.h5")
    ):
        raise ModelNotTrained("Train model before prediction.")
    self.tokenizer = AutoTokenizer.from_pretrained(
        self.parent.config.output_dir
    )
    with self.parent.config.strategy.scope():
        self.model = TFAutoModelForSequenceClassification.from_pretrained(
            self.parent.config.output_dir
        )
    trainer = TFTrainer(model=self.model, args=self.parent.config)
    async for record in sources.with_features(self.features):
        to_predict = record.features(self.features)
        # Wrap the text in a GLUE-style InputExample; the label is only a
        # placeholder required by the converter
        eval_example = [
            InputExample(
                0,
                to_predict[self.features[0]],
                None,
                self.parent.config.label_list[0],
            )
        ]
        eval_features = glue_convert_examples_to_features(
            eval_example,
            self.tokenizer,
            self.parent.config.max_seq_length,
            self.parent.config.task_name,
            self.parent.config.label_list,
        )
        eval_dataset = await self.example_features_to_dataset(
            eval_features
        )
        all_prob = trainer.predict(eval_dataset).predictions
        max_prob_idx = all_prob.argmax(axis=-1)
        self.logger.debug(
            "Predicted probability of {} for {}: {}".format(
                self.parent.config.predict.name, to_predict, all_prob[0]
            )
        )
        record.predicted(
            self.parent.config.predict.name,
            self.parent.config.label_list[max_prob_idx[0]],
            all_prob[0][max_prob_idx[0]],
        )
        yield record
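
# Sketch of the InputExample built above, assuming transformers' GLUE-style
# record (guid, text_a, optional text_b for sentence pairs, label). The
# label is only a placeholder; predict() keeps the returned probabilities.
from transformers import InputExample

example = InputExample(0, "the movie was great", None, "positive")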
async def predict(
    self, sources: SourcesContext
) -> AsyncIterator[Tuple[Record, Any, float]]:
    if not os.path.isfile(self._filename()):
        raise ModelNotTrained("Train model before prediction.")
    importance, tag, base = None, None, None
    if self.parent.config.importance:
        importance = self.parent.config.importance.name
    if self.parent.config.tag:
        tag = self.parent.config.tag.name
    if self.parent.config.base:
        base = self.parent.config.base.name
    async for record in sources.with_features(self.features):
        feature_data = record.features(
            self.features + self.parent.config.extra_cols
        )
        data = pd.DataFrame(feature_data, index=[0])
        if not self.parent.config.noconvert:
            # Convert the row into Vowpal Wabbit's text example format
            data = df_to_vw_format(
                data,
                vwcmd=self.parent.config.vwcmd,
                target=None,
                namespace=self.parent.config.namespace,
                importance=importance,
                tag=tag,
                base=base,
                task=self.parent.config.task,
                use_binary_label=self.parent.config.use_binary_label,
            )
        else:
            data = (
                data.drop(self.parent.config.extra_cols, axis=1)
                .to_numpy()
                .flatten()
            )
        prediction = self.clf.predict(data[0])
        self.logger.debug(
            "Predicted Value of {} for {}: {}".format(
                self.parent.config.predict.name, data, prediction
            )
        )
        target = self.parent.config.predict.name
        record.predicted(target, prediction, self.confidence)
        yield record
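
# Context for df_to_vw_format above, as a plain-text sketch: Vowpal Wabbit
# consumes examples as lines of "label [importance] ['tag] | features".
# The line below is hand-written with made-up values, not tool output.
example_line = "1 2.0 'house1| price:0.23 sqft:0.25"
# with noconvert False, self.clf.predict(data[0]) receives one such line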
async def predict_input_fn(self, sources: SourcesContext, **kwargs):
    """
    Uses the numpy input function with data from record features.
    """
    x_cols: Dict[str, Any] = {feature: [] for feature in self.features}
    ret_records = []
    async for record in sources.with_features(self.features):
        ret_records.append(record)
        for feature, results in record.features(self.features).items():
            x_cols[feature].append(self.np.array(results))
    for feature in x_cols:
        x_cols[feature] = self.np.array(x_cols[feature])
    self.logger.info("------ Record Data ------")
    self.logger.info("x_cols: %d", len(list(x_cols.values())[0]))
    self.logger.info("-----------------------")
    input_fn = self.tf.compat.v1.estimator.inputs.numpy_input_fn(
        x_cols, shuffle=False, num_epochs=1, **kwargs
    )
    return input_fn, ret_records
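
# Usage sketch for the input function above, assuming the TF1 compat
# estimator API that predict_input_fn calls. Feature names are invented.
import numpy as np
import tensorflow as tf

x_cols = {
    "sepal_length": np.array([5.1, 6.2]),
    "sepal_width": np.array([3.5, 2.9]),
}
input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
    x_cols, shuffle=False, num_epochs=1
)
# estimator.predict(input_fn=input_fn) then yields one result per row,
# in source order, since shuffling is disabled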
async def score(
    self, mctx: ModelContext, sctx: SourcesContext, *features: Feature
):
    if not mctx.is_trained:
        raise ModelNotTrained("Train model before assessing for accuracy.")
    if mctx.parent.clf._estimator_type not in ("classifier", "regressor"):
        raise ScorerWillNotWork(
            "SklearnModelAccuracy will not work with Clustering Models"
        )
    is_multi = len(features) > 1
    if is_multi:
        predictions = [feature.name for feature in features]
    elif len(features) == 1:
        (features,) = features
        predictions = features.name
    xdata = []
    ydata = []
    async for record in sctx.with_features(
        list(mctx.np.hstack(mctx.features + [predictions]))
    ):
        feature_data = []
        predict_data = []
        for feature in record.features(mctx.features).values():
            feature_data.extend(
                [feature] if mctx.np.isscalar(feature) else feature
            )
        xdata.append(feature_data)
        if is_multi:
            for feature in record.features(predictions).values():
                predict_data.extend(
                    [feature] if mctx.np.isscalar(feature) else feature
                )
        else:
            predict_data = record.feature(predictions)
        ydata.append(predict_data)
    xdata = mctx.np.array(xdata)
    ydata = mctx.np.array(ydata)
    mctx.logger.debug("Number of input records: {}".format(len(xdata)))
    mctx.confidence = mctx.parent.clf.score(xdata, ydata)
    return mctx.confidence
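
# Small sketch of what clf.score() returns at the end of score(): mean
# accuracy for classifiers, the R^2 coefficient for regressors. The toy
# model and data are assumptions for illustration.
from sklearn.linear_model import LogisticRegression

X = [[0.0], [1.0], [2.0], [3.0]]
y = [0, 0, 1, 1]
clf = LogisticRegression().fit(X, y)
print(clf.score(X, y))  # e.g. 1.0 on this separable toy data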
async def predict(
    self, sources: SourcesContext
) -> AsyncIterator[Tuple[Record, Any, float]]:
    if not os.path.isfile(
        os.path.join(self.parent.config.output_dir, "tf_model.h5")
    ):
        raise ModelNotTrained("Train model before prediction.")
    with self.parent.config.strategy.scope():
        self.model = TFAutoModelForTokenClassification.from_pretrained(
            self.parent.config.output_dir,
            config=self.config,
            cache_dir=self.parent.config.cache_dir,
        )
    async for record in sources.with_features(
        [self.parent.config.words.name]
    ):
        sentence = record.features([self.parent.config.words.name])
        df = self.pd.DataFrame(sentence, index=[0])
        test_dataset = self.get_dataset(df, self.tokenizer, mode="test")
        trainer = TFTrainer(
            model=self.model,
            args=self.parent.config,
            train_dataset=None,
            eval_dataset=None,
            compute_metrics=self.compute_metrics,
        )
        predictions, label_ids, _ = trainer.predict(
            test_dataset.get_dataset()
        )
        preds_list, _ = self.align_predictions(predictions, label_ids)
        # Pair each whitespace-delimited word with its predicted tag
        preds = [
            {word: preds_list[0][i]}
            for i, word in enumerate(
                sentence[self.parent.config.words.name].split()
            )
        ]
        # No meaningful per-token confidence is available here
        record.predicted(
            self.parent.config.predict.name, preds, float("nan")
        )
        yield record
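
# Toy sketch of the word/tag pairing above: the first (and only) sentence's
# tag list from align_predictions is zipped against the whitespace-split
# sentence, one {word: tag} dict per token. Tags below are made up.
sentence = "Alice visited Berlin"
preds_list = [["B-PER", "O", "B-LOC"]]
preds = [{word: preds_list[0][i]} for i, word in enumerate(sentence.split())]
print(preds)  # [{'Alice': 'B-PER'}, {'visited': 'O'}, {'Berlin': 'B-LOC'}]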
async def predict(self, sources: SourcesContext) -> AsyncIterator[Record]:
    target = self.parent.config.predict.name
    async for record in sources.with_features(
        self.parent.config.features.names()
    ):
        record.predicted(target, random.random(), float(record.key))
        yield record
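
# Hedged usage sketch: every predict() above is an async generator, so a
# caller drains it with `async for`. Model and source setup are elided;
# mctx/sctx are assumed to be an open model context and source context.
import asyncio

async def show_predictions(mctx, sctx):
    async for record in mctx.predict(sctx):
        # each yielded Record has had record.predicted(...) set on it
        print(record.key)

# asyncio.run(show_predictions(mctx, sctx))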