def hard_negative_multilabel(self):
    """Hard-negative triplet sampling under the multilabel assumption.

    Draws an (anchor, positive) pair from one label class and ``self._k``
    candidate negatives from another class, then keeps the candidate with
    the smallest label-intersection similarity to the anchor (i.e. the
    "hardest" of the k negatives).

    Returns:
        Tuple ``(anchor_id, positive_id, negative_id, margin)`` where
        ``margin`` is sim(anchor, positive) - sim(anchor, negative).
    """
    # During early iterations of sampling, use random sampling instead
    # so the index-based statistics have time to warm up.
    if self._iteration <= self._n:
        return self.random_multilabel()
    # BUG FIX: in Python 3, dict.keys() is a view object and
    # np.random.choice() rejects it ("a must be 1-dimensional");
    # materialize the keys as a list first.
    # NOTE(review): np.random.choice samples WITH replacement, so the two
    # class ids (and the two sample ids below) may coincide -- confirm
    # whether distinct draws (replace=False) were intended.
    anchor_class_id, negative_class_id = np.random.choice(
        list(self._index.keys()), 2)
    anchor_id, positive_id = np.random.choice(
        self._index[anchor_class_id], 2)
    negative_ids = np.random.choice(
        self._index[negative_class_id], self._k)
    # Calculate similarities and pick the negative with the smallest one.
    anchor_label = parse_label(self._labels[anchor_id])
    positive_label = parse_label(self._labels[positive_id])
    negative_labels = [parse_label(self._labels[negative_id])
                       for negative_id in negative_ids]
    p_sim = intersect_sim(anchor_label, positive_label)
    n_sims = np.array(
        [intersect_sim(anchor_label, negative_label)
         for negative_label in negative_labels])
    min_sim_id = np.argmin(n_sims)
    negative_id = negative_ids[min_sim_id]
    n_sim = n_sims[min_sim_id]
    margin = p_sim - n_sim
    return (anchor_id, positive_id, negative_id, margin)
def hard_negative_multilabel(self):
    """Hard-negative triplet sampling under the multilabel assumption.

    Searches, among ``self._k`` randomly drawn negatives, for the one
    with the largest distance (smallest label-intersection similarity)
    to the anchor.

    Returns:
        Tuple ``(anchor_id, positive_id, negative_id, margin)`` where
        ``margin`` is sim(anchor, positive) - sim(anchor, negative).
    """
    # During early iterations of sampling, use random sampling instead.
    if self._iteration <= self._n:
        return self.random_multilabel()
    # BUG FIX: np.random.choice cannot consume a Python-3 dict_keys view
    # ("a must be 1-dimensional") -- wrap the keys in list() first.
    # NOTE(review): sampling is with replacement, so anchor and negative
    # class ids (and anchor/positive sample ids) may repeat -- confirm.
    anchor_class_id, negative_class_id = np.random.choice(
        list(self._index.keys()), 2)
    anchor_id, positive_id = np.random.choice(self._index[anchor_class_id], 2)
    negative_ids = np.random.choice(self._index[negative_class_id], self._k)
    # Calculate similarities; keep the least-similar (hardest) negative.
    anchor_label = parse_label(self._labels[anchor_id])
    positive_label = parse_label(self._labels[positive_id])
    negative_labels = [
        parse_label(self._labels[negative_id])
        for negative_id in negative_ids
    ]
    p_sim = intersect_sim(anchor_label, positive_label)
    n_sims = np.array([
        intersect_sim(anchor_label, negative_label)
        for negative_label in negative_labels
    ])
    min_sim_id = np.argmin(n_sims)
    negative_id = negative_ids[min_sim_id]
    n_sim = n_sims[min_sim_id]
    margin = p_sim - n_sim
    return (anchor_id, positive_id, negative_id, margin)
def random_multilabel(self):
    """Random triplet sampling under the multilabel assumption.

    Identical to plain random sampling, except that it additionally
    reports a similarity margin for the triplet: the anchor/positive
    label similarity minus the anchor/negative label similarity.

    Returns:
        Tuple ``(anchor_id, positive_id, negative_id, margin)``.
    """
    anchor_id, positive_id, negative_id = self.random_sampling()
    # Parse the raw label strings of the three sampled items.
    parsed = [parse_label(self._labels[sample_id])
              for sample_id in (anchor_id, positive_id, negative_id)]
    anchor_label, positive_label, negative_label = parsed
    # margin = sim(anchor, positive) - sim(anchor, negative)
    margin = (intersect_sim(anchor_label, positive_label)
              - intersect_sim(anchor_label, negative_label))
    return (anchor_id, positive_id, negative_id, margin)
def calculate_label_dim(self):
    """Set ``self._label_dim`` to the number of distinct labels seen.

    Parses every label string in ``self._label`` and counts the size of
    the resulting label set.

    NOTE(review): this reads ``self._label`` (singular) while the
    sampling methods read ``self._labels`` -- confirm both attributes
    exist on the instance.
    """
    distinct_labels = set()
    for label_str in self._label:
        # parse_label yields a list of label ids for this sample.
        distinct_labels.update(parse_label(label_str))
    self._label_dim = len(distinct_labels)
def _build_index(self): """Build Index to randomly fetch samples from data The index is in the format of python dict {label: [list of sample id]} """ self._sample_count = len(self._labels) self._index = dict() for id in range(self._sample_count): # parse label and insert into self._index labels_ = parse_label(self._labels[id]) for label_ in labels_: if label_ in self._index.keys(): self._index[label_].append(id) else: self._index[label_] = [id]
def get_a_datum(self):
    """Return ``(datum, label)`` at the cursor, then advance the cursor.

    The label is a dense vector of length ``self._label_dim``: multi-hot
    indicators when ``self._multilabel`` is set, otherwise the first
    parsed label id stored in slot 0. The cursor wraps around at
    ``self._sample_count``.
    """
    if self._compressed:
        datum = extract_sample(
            self._data[self._cur], self._mean, self._resize)
    else:
        datum = self._data[self._cur]
    # Build the dense label vector from the parsed label ids.
    label_elems = parse_label(self._label[self._cur])
    label = np.zeros(self._label_dim)
    if self._multilabel:
        for elem in label_elems:
            label[elem] = 1
    else:
        label[0] = label_elems[0]
    # Advance the cursor with wrap-around.
    self._cur = (self._cur + 1) % self._sample_count
    return datum, label
def get_a_datum(self):
    """Fetch the sample and label vector at the current cursor position.

    Advances (and wraps) the cursor afterwards. The label vector has
    ``self._label_dim`` entries: multi-hot flags in multilabel mode,
    otherwise the first parsed label id placed at index 0.
    """
    current = self._cur
    if self._compressed:
        datum = extract_sample(self._data[current], self._mean, self._resize)
    else:
        datum = self._data[current]
    # Translate the raw label string into a dense vector.
    label_elems = parse_label(self._label[current])
    label = np.zeros(self._label_dim)
    if self._multilabel:
        for elem in label_elems:
            label[elem] = 1
    else:
        label[0] = label_elems[0]
    # Move to the next sample, wrapping at the end of the data.
    self._cur = (current + 1) % self._sample_count
    return datum, label