@classmethod
def build_index(cls, sentence_embedder, dataset: UnlabeledBIODataset) -> TensorList:
    # Embed every instance in the unlabeled dataset and collect the
    # embeddings into a single TensorList index.
    index = TensorList()
    for inst in dataset:
        sentence_embedding: torch.Tensor = sentence_embedder(
            sentence_ids=torch.Tensor([inst['id']]),
            dataset_ids=torch.Tensor([dataset.dataset_id]),
        )
        index.append(sentence_embedding)
    return index
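# Minimal usage sketch for build_index (not from the source): `MyEmbedder`,
# `IndexBuilder`, and the UnlabeledBIODataset constructor arguments are
# assumptions; only build_index, UnlabeledBIODataset, and TensorList come
# from the code above.

embedder = MyEmbedder()                        # hypothetical: any callable taking sentence_ids/dataset_ids
dataset = UnlabeledBIODataset(...)             # hypothetical: constructor arguments elided
index = IndexBuilder.build_index(embedder, dataset)  # IndexBuilder: assumed owner class of build_index
assert len(index) == len(dataset)              # one embedding per dataset instance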
def test_append(self):
    tl = TensorList(tensor_list=[
        torch.zeros(1, TENSOR_EMBEDDING_DIM),
        torch.zeros(1, TENSOR_EMBEDDING_DIM),
        torch.zeros(1, TENSOR_EMBEDDING_DIM),
    ])
    assert len(tl) == 3
    assert tl.shape == (3, TENSOR_EMBEDDING_DIM)

    # append accepts numpy arrays as well as torch tensors
    tl.append(np.zeros((1, TENSOR_EMBEDDING_DIM)))
    assert len(tl) == 4
    assert tl.shape == (4, TENSOR_EMBEDDING_DIM)
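def test_numpy_and_tensor(self):
    # Companion sketch (not in the source tests): exercises the .numpy()
    # and .tensor() accessors that the window functions below rely on,
    # assuming they stack entries into an (n, dim) array/tensor as the
    # .shape assertions above suggest.
    tl = TensorList(tensor_list=[
        torch.ones(1, TENSOR_EMBEDDING_DIM),
        torch.ones(1, TENSOR_EMBEDDING_DIM),
    ])
    assert tl.numpy().shape == (2, TENSOR_EMBEDDING_DIM)
    assert tl.tensor().shape == (2, TENSOR_EMBEDDING_DIM)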
class LinearWindowFunction(WindowFunction):
    def __init__(
        self,
        positive_label: str,
        context_window: int,
        feature_extractor: FeatureExtractor,
        feature_summarizer: Callable[[List[Any]], torch.Tensor] = FeatureCollator.sum,
        linear_type: LinearType = LinearType.SVM_LINEAR,
        use_batch: bool = True,
        threshold: Optional[float] = 0.7,
        **kwargs,
    ):
        self.positive_label = positive_label
        self.feature_extractor = feature_extractor
        self.context_window = context_window
        super(LinearWindowFunction, self).__init__(
            positive_label,
            feature_extractor,
            context_window,
            use_batch=use_batch,
            threshold=threshold,
            **kwargs,
        )
        self.dictionary = TensorList()
        self.labels = TensorList()
        self.feature_summarizer = feature_summarizer
        self.linear_model = construct_linear_classifier(linear_type=linear_type)

    @log_time(function_prefix='linear_window_train')
    def _train_model(self, training_data: List[Tuple[List[str], List[Any], str]]):
        # Summarize each context window into a single feature vector, then
        # fit the linear classifier on the class-balanced dataset.
        for sentence_window, feature_window, label in training_data:
            window_summary = self.feature_summarizer(feature_window)
            self.dictionary.append(window_summary)
            self.labels.append(torch.Tensor([label_index(label)]))
        x_train = self.dictionary.numpy()
        y_train = self.labels.numpy()
        x_train, y_train = balance_dataset(x_train, y_train)
        self.linear_model.fit(x_train, y_train)

    def _predict(self, features: List[torch.Tensor]) -> int:
        feature_summary = self.feature_summarizer(features).numpy()
        label: np.ndarray = self.linear_model.predict(feature_summary)
        return label.item()

    def _predict_probabilities(self, features: List[torch.Tensor]) -> float:
        feature_summary = self.feature_summarizer(features).numpy()
        confidence: np.ndarray = self.linear_model.decision_function(feature_summary)
        return confidence.item()

    @log_time(function_prefix='linear_window_snorkel_predict')
    def _batch_probabilities(self, features: List[List[torch.Tensor]]) -> List[float]:
        # Stack all window summaries into one matrix so the classifier
        # scores the whole batch in a single call.
        feature_summaries: List[np.ndarray] = [self.feature_summarizer(f).numpy() for f in features]
        batch_np: np.ndarray = TensorList(feature_summaries).numpy()
        confidence_batch: np.ndarray = self.linear_model.decision_function(batch_np)
        return [conf.item() for conf in TensorList([confidence_batch]).to_list()]

    @log_time(function_prefix='linear_window_predict')
    def _batch_predict(self, features: List[List[torch.Tensor]]) -> List[int]:
        feature_summaries: List[np.ndarray] = [self.feature_summarizer(f).numpy() for f in features]
        batch_np: np.ndarray = TensorList(feature_summaries).numpy()
        label_batch: np.ndarray = self.linear_model.predict(batch_np)
        return [label.item() for label in TensorList([label_batch]).to_list()]

    @overrides
    def __str__(self):
        return f'LinearWindowFunction({self.context_window})({self.feature_extractor})'
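# Usage sketch for LinearWindowFunction (hypothetical data): `extractor` stands
# in for a real FeatureExtractor, 'ADR'/'O' are assumed label strings, and the
# random 64-dim features are stand-ins; two windows with opposite labels are
# used so the underlying SVM sees both classes.

fn = LinearWindowFunction(
    positive_label='ADR',
    context_window=2,
    feature_extractor=extractor,                    # hypothetical FeatureExtractor instance
    linear_type=LinearType.SVM_LINEAR,
)
pos_feats = [torch.randn(1, 64) for _ in range(5)]  # one feature tensor per token; 64 is assumed
neg_feats = [torch.randn(1, 64) for _ in range(5)]
fn._train_model([
    (['drug', 'caused', 'severe', 'nausea', 'today'], pos_feats, 'ADR'),
    (['patient', 'slept', 'well', 'last', 'night'], neg_feats, 'O'),
])
print(fn._batch_predict([pos_feats, neg_feats]))    # e.g. [1, 0]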
class BagWindowFunction(WindowFunction):
    def __init__(
        self,
        positive_label: str,
        context_window: int,
        feature_extractor: FeatureExtractor,
        feature_summarizer: Callable[[List[Any]], torch.Tensor] = FeatureCollator.sum,
        use_batch: bool = True,
        threshold: Optional[float] = 0.7,
        parallelize: bool = False,  # off by default: shared-memory issues locally
        use_sparse: bool = False,   # store the dictionary as a sparse matrix
        **kwargs,
    ):
        self.positive_label = positive_label
        self.feature_extractor = feature_extractor
        self.context_window = context_window
        self.parallelize = parallelize
        super(BagWindowFunction, self).__init__(
            positive_label,
            feature_extractor,
            context_window,
            use_batch=use_batch,
            threshold=threshold,
            **kwargs,
        )
        self.dictionary = SparseTensorList() if use_sparse else TensorList()
        self.labels = TensorList()
        self.feature_summarizer = feature_summarizer

    def _train_model(self, training_data: List[Tuple[List[str], List[Any], str]]):
        # Memorize the summary of every positive window; negative windows
        # are skipped, so the dictionary stores only positive evidence.
        for sentence_window, feature_window, label in training_data:
            if is_negative(label):
                continue
            window_summary = self.feature_summarizer(feature_window)
            self.dictionary.append(window_summary.float())
            self.labels.append(torch.Tensor([label_index(label)]))

    def _predict(self, features: List[torch.Tensor]) -> int:
        feature_summary = self.feature_summarizer(features)
        labels = self.labels.tensor().long()
        found_index = self.dictionary.contains(feature_summary)
        if found_index == -1:
            return 0  # no confidence (should be ABSTAIN)
        return labels[found_index].item()

    def _predict_probabilities(self, features: List[torch.Tensor]) -> float:
        feature_summary = self.feature_summarizer(features)
        labels = self.labels.tensor().long()
        found_index = self.dictionary.contains(feature_summary)
        if found_index == -1:
            return 0.  # no confidence (should be ABSTAIN)
        return 2 * labels[found_index].item() - 1  # map {0, 1} -> {-1, 1}

    def _batch_predict(self, features: List[List[torch.Tensor]]) -> List[int]:
        return [self._predict(f) for f in features]

    def _batch_probabilities(self, features: List[List[torch.Tensor]]) -> List[float]:
        if self.parallelize:
            # Share the underlying tensors before forking so worker processes
            # read the same memory instead of copying it; the context manager
            # ensures the pool is torn down after the map.
            self.dictionary.share_memory()
            self.labels.share_memory()
            with multiprocessing.Pool() as pool:
                return pool.map(self._predict_probabilities, features)
        return [self._predict_probabilities(f) for f in features]

    @overrides
    def __str__(self):
        return f'BagWindowFunction({self.context_window})({self.feature_extractor})'
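# Usage sketch for BagWindowFunction (hypothetical data, reusing the assumed
# names from the sketch above): the bag memorizes only positive window
# summaries and matches them exactly, so a previously seen window scores +1
# while any unseen window falls back to 0 (effectively an abstain).

bag = BagWindowFunction(
    positive_label='ADR',
    context_window=2,
    feature_extractor=extractor,                # hypothetical FeatureExtractor instance
)
bag._train_model([
    (['drug', 'caused', 'severe', 'nausea', 'today'], pos_feats, 'ADR'),  # stored in the dictionary
    (['patient', 'slept', 'well', 'last', 'night'], neg_feats, 'O'),      # skipped: negative label
])
print(bag._predict_probabilities(pos_feats))    # 1: exact dictionary hit
print(bag._predict_probabilities(neg_feats))    # 0.0: miss -> no confidence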