def __init__(
    self,
    positive_label: str,
    context_window: int,
    feature_extractor: FeatureExtractor,
    feature_summarizer: Callable[[List[Any]], torch.Tensor] = FeatureCollator.sum,
    linear_type: LinearType = LinearType.SVM_LINEAR,
    use_batch: bool = True,
    threshold: Optional[float] = 0.7,
    **kwargs,
):
    """Set up a window function backed by a linear classifier.

    Args:
        positive_label: label stored on the instance and forwarded to the
            parent initializer.
        context_window: window size stored on the instance and forwarded
            to the parent initializer.
        feature_extractor: extractor stored on the instance and forwarded
            to the parent initializer.
        feature_summarizer: callable that reduces a list of features to a
            single tensor; defaults to ``FeatureCollator.sum``.
        linear_type: which classifier ``construct_linear_classifier``
            should build.
        use_batch: forwarded to the parent initializer.
        threshold: forwarded to the parent initializer.
        **kwargs: forwarded untouched to the parent initializer.
    """
    # These are assigned before the parent initializer runs.
    # NOTE(review): presumably the parent may read them during its own
    # setup -- confirm before moving them below the super() call.
    self.positive_label = positive_label
    self.feature_extractor = feature_extractor
    self.context_window = context_window

    # The parent takes feature_extractor *before* context_window, which is
    # the reverse of this signature's parameter order.
    super(LinearWindowFunction, self).__init__(
        positive_label,
        feature_extractor,
        context_window,
        use_batch=use_batch,
        threshold=threshold,
        **kwargs,
    )

    # Per-instance state: summarizer, empty feature/label stores, and the
    # classifier selected by linear_type.
    self.feature_summarizer = feature_summarizer
    self.dictionary = TensorList()
    self.labels = TensorList()
    self.linear_model = construct_linear_classifier(linear_type=linear_type)
def extract_features(
    data: AnnotatedDataType,
    dataset_id: int,
    shuffle: bool,
    feature_extractor: Callable[[AnnotationType], torch.Tensor],
):
    """Build a binary training set from annotated entries.

    For every entry the extractor produces a feature tensor; the rows
    selected by the two index groups returned from ``get_label_index`` are
    collected as positive and negative examples respectively. Positive
    examples are labelled 1 and negative examples 0.

    Args:
        data: iterable of entries; each entry is indexed with ``'output'``
            to obtain its tags.
        dataset_id: accepted for interface compatibility; not used here.
        shuffle: forwarded to ``construct_train_data``.
        feature_extractor: maps one entry to its feature tensor.

    Returns:
        The ``(x_train, y_train)`` pair produced by ``construct_train_data``.
    """
    pos_collector: TensorList = TensorList()
    neg_collector: TensorList = TensorList()

    for entry in data:
        tags: List[str] = entry['output']
        entry_features: torch.Tensor = feature_extractor(entry)
        pos_idx, neg_idx = get_label_index(tags)
        pos_collector.append(entry_features[pos_idx])
        neg_collector.append(entry_features[neg_idx])

    pos_array: np.ndarray = pos_collector.numpy()
    neg_array: np.ndarray = neg_collector.numpy()

    # One label per collected row: 1.0 for positives, 0.0 for negatives.
    pos_targets: np.ndarray = np.ones((len(pos_array), ))
    neg_targets: np.ndarray = np.zeros(len(neg_array))

    return construct_train_data(
        pos_data=pos_array,
        neg_data=neg_array,
        pos_labels=pos_targets,
        neg_labels=neg_targets,
        shuffle=shuffle,
    )
def __init__(
    self,
    positive_label: str,
    context_window: int,
    feature_extractor: FeatureExtractor,
    feature_summarizer: Callable[[List[Any]], torch.Tensor] = FeatureCollator.sum,
    use_batch: bool = True,
    threshold: Optional[float] = 0.7,
    parallelize: bool = False,  # shared memory issue locally
    use_sparse: bool = False,  # store dictionary as sparse matrix
    **kwargs,
):
    """Set up a bag-style window function.

    Args:
        positive_label: label stored on the instance and forwarded to the
            parent initializer.
        context_window: window size stored on the instance and forwarded
            to the parent initializer.
        feature_extractor: extractor stored on the instance and forwarded
            to the parent initializer.
        feature_summarizer: callable that reduces a list of features to a
            single tensor; defaults to ``FeatureCollator.sum``.
        use_batch: forwarded to the parent initializer.
        threshold: forwarded to the parent initializer.
        parallelize: stored on the instance as ``self.parallelize``.
        use_sparse: when true the dictionary is a ``SparseTensorList``,
            otherwise a ``TensorList``.
        **kwargs: forwarded untouched to the parent initializer.
    """
    # These are assigned before the parent initializer runs.
    # NOTE(review): presumably the parent may read them during its own
    # setup -- confirm before moving them below the super() call.
    self.positive_label = positive_label
    self.feature_extractor = feature_extractor
    self.context_window = context_window
    self.parallelize = parallelize

    # The parent takes feature_extractor *before* context_window, which is
    # the reverse of this signature's parameter order.
    super(BagWindowFunction, self).__init__(
        positive_label,
        feature_extractor,
        context_window,
        use_batch=use_batch,
        threshold=threshold,
        **kwargs,
    )

    # Pick the dictionary container according to use_sparse.
    if use_sparse:
        self.dictionary = SparseTensorList()
    else:
        self.dictionary = TensorList()
    self.labels = TensorList()
    self.feature_summarizer = feature_summarizer
def test_constructor_numpy(self):
    """A TensorList built from three (1, D) numpy arrays reports 3 rows."""
    numpy_rows = [np.zeros((1, TENSOR_EMBEDDING_DIM)) for _ in range(3)]
    tensor_list = TensorList(tensor_list=numpy_rows)

    assert len(tensor_list) == 3
    assert tensor_list.shape == (3, TENSOR_EMBEDDING_DIM)
def test_constructor_tensor(self):
    """A TensorList built from three (1, D) torch tensors reports 3 rows."""
    torch_rows = [torch.zeros(1, TENSOR_EMBEDDING_DIM) for _ in range(3)]
    tensor_list = TensorList(tensor_list=torch_rows)

    assert len(tensor_list) == 3
    assert tensor_list.shape == (3, TENSOR_EMBEDDING_DIM)
def build_index(cls, sentence_embedder, dataset: UnlabeledBIODataset) -> TensorList:
    """Embed every instance of ``dataset`` and collect the results.

    Args:
        sentence_embedder: callable invoked with ``sentence_ids`` and
            ``dataset_ids`` keyword arguments, each a one-element tensor.
        dataset: iterable of instances indexed with ``'id'``; also provides
            ``dataset_id``.

    Returns:
        A TensorList holding one appended embedding per instance, in
        iteration order.
    """
    embeddings = TensorList()
    for instance in dataset:
        # Both ids are wrapped in single-element tensors for the embedder.
        embeddings.append(
            sentence_embedder(
                sentence_ids=torch.Tensor([instance['id']]),
                dataset_ids=torch.Tensor([dataset.dataset_id]),
            )
        )
    return embeddings
def test_numpy(self):
    """``TensorList.numpy()`` returns exactly an ``np.ndarray``."""
    zero_rows = [torch.zeros(1, TENSOR_EMBEDDING_DIM) for _ in range(3)]
    tensor_list = TensorList(tensor_list=zero_rows)

    as_numpy = tensor_list.numpy()
    # Exact type match (subclasses are deliberately not accepted).
    assert type(as_numpy) is np.ndarray
def test_tensor(self):
    """``TensorList.tensor()`` returns exactly a ``torch.Tensor``."""
    zero_rows = [torch.zeros(1, TENSOR_EMBEDDING_DIM) for _ in range(3)]
    tensor_list = TensorList(tensor_list=zero_rows)

    as_tensor = tensor_list.tensor()
    # Exact type match (subclasses are deliberately not accepted).
    assert type(as_tensor) is torch.Tensor
def test_append(self):
    """Appending one numpy row to a 3-row TensorList yields 4 rows."""
    initial_rows = [torch.zeros(1, TENSOR_EMBEDDING_DIM) for _ in range(3)]
    tensor_list = TensorList(tensor_list=initial_rows)

    assert len(tensor_list) == 3
    assert tensor_list.shape == (3, TENSOR_EMBEDDING_DIM)

    # The list was built from torch tensors; the appended row is numpy.
    extra_row = np.zeros((1, TENSOR_EMBEDDING_DIM))
    tensor_list.append(extra_row)

    assert len(tensor_list) == 4
    assert tensor_list.shape == (4, TENSOR_EMBEDDING_DIM)
def construct_train_data(
    pos_data: np.ndarray,
    neg_data: np.ndarray,
    pos_labels: np.ndarray,
    neg_labels: np.ndarray,
    shuffle: Optional[bool] = False,
) -> Tuple[np.ndarray, np.ndarray]:
    """Combine positive and negative examples into one training set.

    Positives are appended before negatives for both the data and the
    labels, so the two outputs stay aligned.

    Args:
        pos_data: features of the positive examples.
        neg_data: features of the negative examples.
        pos_labels: labels matching ``pos_data``.
        neg_labels: labels matching ``neg_data``.
        shuffle: when truthy, data and labels are reordered with the same
            random permutation.

    Returns:
        ``(data, labels)`` as numpy arrays.
    """
    features: TensorList = TensorList()
    targets: TensorList = TensorList()

    for chunk in (pos_data, neg_data):
        features.append(chunk)
    for chunk in (pos_labels, neg_labels):
        targets.append(chunk)

    if not shuffle:
        return features.numpy(), targets.numpy()

    x = features.tensor()
    y = targets.tensor()
    # One permutation indexes both tensors so rows and labels stay paired.
    permutation = torch.randperm(len(x))
    return x[permutation].numpy(), y[permutation].numpy()
def test_tensor_list(self):
    """``to_list()`` returns the same container type with equal elements."""
    source_rows = [torch.zeros(1, TENSOR_EMBEDDING_DIM) for _ in range(3)]
    tensor_list = TensorList(tensor_list=source_rows)

    round_tripped = tensor_list.to_list()

    assert type(round_tripped) is type(source_rows)
    # zip stops at the shorter sequence, so this compares elements
    # pairwise but does not by itself detect a length mismatch.
    for expected, actual in zip(source_rows, round_tripped):
        assert (expected == actual).all()
def test_extend(self):
    """Extending a 3-row TensorList with 3 more rows yields 6 rows."""

    def _three_zero_rows():
        # Fresh tensors on every call so the two batches share no objects.
        return [torch.zeros(1, TENSOR_EMBEDDING_DIM) for _ in range(3)]

    tensor_list = TensorList(tensor_list=_three_zero_rows())
    assert len(tensor_list) == 3
    assert tensor_list.shape == (3, TENSOR_EMBEDDING_DIM)

    tensor_list.extend(_three_zero_rows())
    assert len(tensor_list) == 6
    assert tensor_list.shape == (6, TENSOR_EMBEDDING_DIM)
def test_contains_tensor(self):
    """``contains`` returns index 0 for a stored row and -1 otherwise."""
    stored_rows = [torch.zeros(1, TENSOR_EMBEDDING_DIM) for _ in range(3)]
    tensor_list = TensorList(tensor_list=stored_rows)

    # Every stored row is all zeros; the test expects index 0 to be reported.
    present_query = torch.zeros(1, TENSOR_EMBEDDING_DIM)
    assert tensor_list.contains(present_query) == 0

    # An all-ones row was never stored, so -1 is expected.
    absent_query = torch.ones(1, TENSOR_EMBEDDING_DIM)
    assert tensor_list.contains(absent_query) == -1
def _batch_predict(self, features: List[List[torch.Tensor]]) -> List[int]:
    """Predict a label for every feature group in one classifier call.

    Args:
        features: one list of tensors per example; each list is reduced by
            ``self.feature_summarizer`` before prediction.

    Returns:
        Python scalars obtained via ``.item()`` from the model's predictions.
    """
    summaries: List[np.ndarray] = [
        self.feature_summarizer(group).numpy() for group in features
    ]
    batch_np: np.ndarray = TensorList(summaries).numpy()
    predicted: np.ndarray = self.linear_model.predict(batch_np)

    # The prediction array is passed through TensorList.to_list() and each
    # resulting element is converted to a Python scalar.
    return [label.item() for label in TensorList([predicted]).to_list()]
def _batch_probabilities(self, features: List[List[torch.Tensor]]) -> List[float]:
    """Score every feature group in one ``decision_function`` call.

    Args:
        features: one list of tensors per example; each list is reduced by
            ``self.feature_summarizer`` before scoring.

    Returns:
        Python scalars obtained via ``.item()`` from the values returned by
        the model's ``decision_function``.
    """
    summaries: List[np.ndarray] = [
        self.feature_summarizer(group).numpy() for group in features
    ]
    batch_np: np.ndarray = TensorList(summaries).numpy()
    scores: np.ndarray = self.linear_model.decision_function(batch_np)

    # The score array is passed through TensorList.to_list() and each
    # resulting element is converted to a Python scalar.
    return [score.item() for score in TensorList([scores]).to_list()]
def concat(cls, features: List[torch.Tensor]) -> torch.Tensor:
    """Pack ``features`` into a TensorList and flatten it to a single row.

    Returns:
        The TensorList's tensor reshaped to ``(1, -1)``.
    """
    return TensorList(features).tensor().reshape(1, -1)
def test_empty_construct(self):
    """A TensorList built with no arguments is empty with shape ``(0,)``."""
    empty_list = TensorList()

    assert len(empty_list) == 0
    assert empty_list.shape == (0, )