def generate_negatives(self, dataset1: 'Dataset', dataset2: 'Dataset', score_function: Callable,
                       num_of_negatives: int = -1, range_in_gt: bool = False):
    """
    Add the highest-scoring non-member pairs as negatives.

    Every pair yielded by ``get_record_pairs(dataset1, dataset2)`` that passes the
    filters below is scored with ``score_function``. A bounded heap retains the
    ``num_of_negatives`` largest ``(score, id1, id2)`` entries, which are finally
    handed to :meth:`add_negative`.

    Args:
        dataset1 (Dataset): First dataset passed to ``get_record_pairs``.
        dataset2 (Dataset): Second dataset passed to ``get_record_pairs``.
        score_function (Callable): Called as ``score_function(r1, r2)`` with the two
            records of a pair. Its return value is the first element of the heap
            entries, so it has to be orderable.
        num_of_negatives (int, optional): Number of negatives to generate.
            Default is -1, which falls back to ``len(self)`` (intended to give as
            many negatives as positives).
        range_in_gt (bool, optional): If True, only pairs whose first id is in
            ``self._gt_id1s`` and whose second id is in ``self._gt_id2s`` are
            considered; otherwise every yielded pair is a candidate. Default is False.
    """
    limit = num_of_negatives
    if limit == -1:
        limit = len(self)

    # heapq is a min-heap: evicting the root whenever the heap outgrows `limit`
    # leaves exactly the `limit` largest entries behind.
    best_entries = []
    for rec1, rec2 in get_record_pairs(dataset1, dataset2):
        id1, id2 = rec1.id, rec2.id
        if self.is_member(id1, id2):
            continue
        if range_in_gt and (id1 not in self._gt_id1s or id2 not in self._gt_id2s):
            continue

        # ties on the score are broken by the ids (tuple comparison)
        heapq.heappush(best_entries, (score_function(rec1, rec2), id1, id2))
        if len(best_entries) > limit:
            heapq.heappop(best_entries)

    for _, id1, id2 in best_entries:
        self.add_negative(id1, id2)
def generate_negatives(self, dataset1: 'Dataset', dataset2: 'Dataset', score_function: Callable,
                       num_of_negatives: int = -1, range_in_gt: bool = False,
                       exclude_from: 'GroundTruth' = None):
    """
    Add the highest-scoring non-member pairs as negatives.

    Every pair yielded by ``get_record_pairs(dataset1, dataset2)`` that passes the
    filters below is scored with ``score_function``. A bounded heap retains the
    ``num_of_negatives`` largest ``(score, id1, id2)`` entries, which are finally
    handed to :meth:`add_negative`.

    Args:
        dataset1 (Dataset): Dataset 1.
        dataset2 (Dataset): Dataset 2.
        score_function (Callable): User function, inputs are two
            :meth:`rltk.record.Record` s, return is a float. The value is used as
            the first element of the heap entries.
        num_of_negatives (int, optional): Number of negatives to generate.
            Default is -1, which falls back to ``len(self)`` (intended to give as
            many negatives as positives).
        range_in_gt (bool, optional): If True, only pairs whose first id is in
            ``self._gt_id1s`` and whose second id is in ``self._gt_id2s`` are
            considered; otherwise the range is every yielded pair. Default is False.
        exclude_from (GroundTruth, optional): Pairs for which
            ``exclude_from.is_member`` is true are skipped. Defaults to None.
            This is especially useful when generating negatives for a test set
            while the pairs of the train set need to be left out.
    """
    limit = num_of_negatives
    if limit == -1:
        limit = len(self)

    # heapq is a min-heap: evicting the root whenever the heap outgrows `limit`
    # leaves exactly the `limit` largest entries behind.
    best_entries = []
    for rec1, rec2 in get_record_pairs(dataset1, dataset2):
        id1, id2 = rec1.id, rec2.id
        if self.is_member(id1, id2):
            continue
        if exclude_from and exclude_from.is_member(id1, id2):
            continue
        if range_in_gt and (id1 not in self._gt_id1s or id2 not in self._gt_id2s):
            continue

        # ties on the score are broken by the ids (tuple comparison)
        heapq.heappush(best_entries, (score_function(rec1, rec2), id1, id2))
        if len(best_entries) > limit:
            heapq.heappop(best_entries)

    for _, id1, id2 in best_entries:
        self.add_negative(id1, id2)
def generate_all_negatives(self, dataset1: 'Dataset', dataset2: 'Dataset', range_in_gt: bool = False):
    """
    Add every candidate pair that is not a member of this ground truth as a negative.

    Args:
        dataset1 (Dataset): First dataset passed to ``get_record_pairs``.
        dataset2 (Dataset): Second dataset passed to ``get_record_pairs``.
        range_in_gt (bool, optional): If True, only pairs whose first id is in
            ``self._gt_id1s`` and whose second id is in ``self._gt_id2s`` are
            added; otherwise every yielded pair is a candidate. Default is False.
    """
    for rec1, rec2 in get_record_pairs(dataset1, dataset2):
        id1, id2 = rec1.id, rec2.id
        if self.is_member(id1, id2):
            continue
        if range_in_gt and (id1 not in self._gt_id1s or id2 not in self._gt_id2s):
            continue
        self.add_negative(id1, id2)
def generate_all_negatives(self, dataset1: 'Dataset', dataset2: 'Dataset', range_in_gt: bool = False):
    """
    Add every candidate pair that is not a member of this ground truth as a negative.

    Args:
        dataset1 (Dataset): Dataset 1.
        dataset2 (Dataset): Dataset 2.
        range_in_gt (bool, optional): If True, negatives are restricted to pairs
            whose first id is in ``self._gt_id1s`` and whose second id is in
            ``self._gt_id2s``; otherwise the range is every pair yielded for the
            two datasets. Default is False.
    """
    for rec1, rec2 in get_record_pairs(dataset1, dataset2):
        id1, id2 = rec1.id, rec2.id
        if self.is_member(id1, id2):
            continue
        if range_in_gt and (id1 not in self._gt_id1s or id2 not in self._gt_id2s):
            continue
        self.add_negative(id1, id2)
def generate_all_negatives(self, dataset1: 'Dataset', dataset2: 'Dataset', range_in_gt: bool = False,
                           exclude_from: 'GroundTruth' = None):
    """
    Add every candidate pair that is not a member of this ground truth as a negative.

    Args:
        dataset1 (Dataset): Dataset 1.
        dataset2 (Dataset): Dataset 2.
        range_in_gt (bool, optional): If True, negatives are restricted to pairs
            whose first id is in ``self._gt_id1s`` and whose second id is in
            ``self._gt_id2s``; otherwise the range is every pair yielded for the
            two datasets. Default is False.
        exclude_from (GroundTruth, optional): Pairs for which
            ``exclude_from.is_member`` is true are skipped. Defaults to None.
            This is especially useful when generating negatives for a test set
            while the pairs of the train set need to be left out.
    """
    for rec1, rec2 in get_record_pairs(dataset1, dataset2):
        id1, id2 = rec1.id, rec2.id
        if self.is_member(id1, id2):
            continue
        if exclude_from and exclude_from.is_member(id1, id2):
            continue
        if range_in_gt and (id1 not in self._gt_id1s or id2 not in self._gt_id2s):
            continue
        self.add_negative(id1, id2)
def generate_stratified_negatives(self, dataset1: 'Dataset', dataset2: 'Dataset', classify: Callable,
                                  num_of_strata: int, random_seed: int = None, num_of_negatives: int = -1,
                                  range_in_gt: bool = False, exclude_from: 'GroundTruth' = None):
    """
    Randomly sample negatives stratum by stratum.

    Candidate pairs are bucketed by ``classify``. Each stratum is weighted by its
    ratio of positives to negatives, and the requested number of negatives is
    distributed over the strata in order of decreasing weight. Strata without
    positives or without negatives have weight 0 and are never sampled.

    Args:
        dataset1 (Dataset): Dataset 1.
        dataset2 (Dataset): Dataset 2.
        classify (Callable): User function, inputs are two :meth:`rltk.record.Record` s,
            return is an integer which identifies the stratum the pair belongs to.
            The returned integer should be in range [0, num_of_strata).
        num_of_strata (int): Number of strata.
        random_seed (int, optional): The seed passed to :py:meth:`random.seed`.
            Any value other than None (including 0) seeds the generator.
        num_of_negatives (int, optional): Number of negatives to generate.
            Default is -1, which uses the number of positives found in the strata.
            Fewer negatives are produced when the weighted strata do not hold enough.
        range_in_gt (bool, optional): If True, only pairs whose first id is in
            ``self._gt_id1s`` and whose second id is in ``self._gt_id2s`` are
            considered; otherwise the range is every pair yielded for the two
            datasets. Default is False.
        exclude_from (GroundTruth, optional): Pairs for which
            ``exclude_from.is_member`` is true are skipped. Defaults to None.
            This is especially useful when generating negatives for a test set
            while the pairs of the train set need to be left out.
    """
    # per stratum: id pairs of the positives ('p') and of the negative candidates ('n')
    strata = [{'p': [], 'n': []} for _ in range(num_of_strata)]

    # build strata
    for r1, r2 in get_record_pairs(dataset1, dataset2):
        if (range_in_gt and not (r1.id in self._gt_id1s and r2.id in self._gt_id2s)) or \
                (exclude_from and exclude_from.is_member(r1.id, r2.id)):
            continue
        stratum_id = classify(r1, r2)
        p_n = 'p' if self.is_member(r1.id, r2.id) else 'n'
        strata[stratum_id][p_n].append((r1.id, r2.id))

    # compute weights: p / n (0 when there is nothing to pick or nothing to imitate)
    weights = [len(s['p']) / len(s['n']) if s['p'] and s['n'] else 0.0 for s in strata]

    # heaviest stratum first; sorted() is stable, so ties keep their stratum order
    ranked = sorted(enumerate(weights), key=itemgetter(1), reverse=True)

    # find out the number of negatives to pick from each stratum
    total_num = sum(len(s['p']) for s in strata) if num_of_negatives == -1 else num_of_negatives
    num_to_pick_from_each_stratum = [0] * num_of_strata
    for pos, (idx, weight) in enumerate(ranked):
        # Weights are sorted in descending order, so a zero weight means every
        # remaining stratum is unusable; stopping here also keeps the
        # normalisation below from dividing by zero.
        if total_num <= 0 or weight <= 0:
            break
        # normalize against the strata that have not been served yet
        denominator = sum(w for _, w in ranked[pos:])
        num_to_pick_from_each_stratum[idx] = \
            min(round(total_num * weight / denominator), len(strata[idx]['n']))
        # prep for next round
        total_num -= num_to_pick_from_each_stratum[idx]

    # pick negatives
    if random_seed is not None:  # 0 is a legitimate seed
        random.seed(random_seed)
    for idx, num in enumerate(num_to_pick_from_each_stratum):
        for id1, id2 in random.sample(strata[idx]['n'], num):
            self.add_negative(id1, id2)
def generate_stratified_negatives(self, dataset1: 'Dataset', dataset2: 'Dataset', classify: Callable,
                                  num_of_strata: int, random_seed: int = None, num_of_negatives: int = -1,
                                  range_in_gt: bool = False):
    """
    Randomly sample negatives stratum by stratum.

    Candidate pairs are bucketed by ``classify``. Each stratum is weighted by its
    ratio of positives to negatives, and the requested number of negatives is
    distributed over the strata in order of decreasing weight. Strata without
    positives or without negatives have weight 0 and are never sampled.

    Args:
        dataset1 (Dataset): First dataset passed to ``get_record_pairs``.
        dataset2 (Dataset): Second dataset passed to ``get_record_pairs``.
        classify (Callable): Called as ``classify(r1, r2)`` with the two records of
            a pair. The returned integer is used as an index into the list of
            strata, so it should be in range [0, num_of_strata).
        num_of_strata (int): Number of strata.
        random_seed (int, optional): The seed passed to :py:meth:`random.seed`.
            Any value other than None (including 0) seeds the generator.
        num_of_negatives (int, optional): Number of negatives to generate.
            Default is -1, which uses the number of positives found in the strata.
            Fewer negatives are produced when the weighted strata do not hold enough.
        range_in_gt (bool, optional): If True, only pairs whose first id is in
            ``self._gt_id1s`` and whose second id is in ``self._gt_id2s`` are
            considered; otherwise every yielded pair is a candidate. Default is False.
    """
    # per stratum: id pairs of the positives ('p') and of the negative candidates ('n')
    strata = [{'p': [], 'n': []} for _ in range(num_of_strata)]

    # build strata
    for r1, r2 in get_record_pairs(dataset1, dataset2):
        if range_in_gt and not (r1.id in self._gt_id1s and r2.id in self._gt_id2s):
            continue
        stratum_id = classify(r1, r2)
        p_n = 'p' if self.is_member(r1.id, r2.id) else 'n'
        strata[stratum_id][p_n].append((r1.id, r2.id))

    # compute weights: p / n. Emptiness is tested on the lists themselves, so a
    # stratum without negatives gets weight 0 rather than dividing by zero.
    weights = [len(s['p']) / len(s['n']) if s['p'] and s['n'] else 0.0 for s in strata]

    # heaviest stratum first; sorted() is stable, so ties keep their stratum order
    ranked = sorted(enumerate(weights), key=itemgetter(1), reverse=True)

    # find out the number of negatives to pick from each stratum
    total_num = sum(len(s['p']) for s in strata) if num_of_negatives == -1 else num_of_negatives
    num_to_pick_from_each_stratum = [0] * num_of_strata
    for pos, (idx, weight) in enumerate(ranked):
        # Weights are sorted in descending order, so a zero weight means every
        # remaining stratum is unusable; stopping here also keeps the
        # normalisation below from dividing by zero.
        if total_num <= 0 or weight <= 0:
            break
        # normalize against the strata that have not been served yet
        denominator = sum(w for _, w in ranked[pos:])
        num_to_pick_from_each_stratum[idx] = \
            min(round(total_num * weight / denominator), len(strata[idx]['n']))
        # prep for next round
        total_num -= num_to_pick_from_each_stratum[idx]

    # pick negatives
    if random_seed is not None:  # 0 is a legitimate seed
        random.seed(random_seed)
    for idx, num in enumerate(num_to_pick_from_each_stratum):
        for id1, id2 in random.sample(strata[idx]['n'], num):
            self.add_negative(id1, id2)