Example #1
def hungarian_tracking(video, cost=200, nms_overlap_fraction=0.6):

    getId = IntegerIDGen()
    # Suppress overlapping bodies
    non_max_supression_video(video, nms_overlap_fraction)

    for i, body in enumerate(video[0].valid_bodies):
        body.set_id(getId())
        video._tracks[body.id] = Track(body)


    for i in tqdm(range(len(video) - 1)):
        current_frame = video[i].valid_bodies
        next_frame = video[i + 1].valid_bodies

        cmap = cost_matrix_tracks_skeleton(current_frame, next_frame, cost)
        _, idx = hungarian(cmap)

        for j in range(len(current_frame)):
            if cmap[j, idx[j]] < cost:

                # Create New ID
                if current_frame[j].id == -1:

                    current_frame[j].set_id(getId())
                    video._tracks[current_frame[j].id] = Track(
                        current_frame[j])

                # Match Next Frame Detections
                next_frame[idx[j]].set_id(current_frame[j].id)
                next_frame[idx[j]].prev = current_frame[j]
                current_frame[j].next = next_frame[idx[j]]
    return
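Every example on this page calls a hungarian function that takes a cost matrix and returns a pair of row/column index arrays; that signature matches scipy.optimize.linear_sum_assignment, so the snippets presumably import it under that alias. Below is a minimal, self-contained sketch of the threshold-gated matching pattern from Example #1 under that assumption; the Euclidean centroid distance stands in for the project-specific cost_matrix_tracks_skeleton helper.

import numpy as np
from scipy.optimize import linear_sum_assignment as hungarian

cost = 200  # maximum distance for a valid match
current = np.array([[10.0, 10.0], [50.0, 50.0]])  # detections in frame i
nxt = np.array([[12.0, 11.0], [200.0, 300.0]])    # detections in frame i + 1

# Pairwise Euclidean distances as the assignment cost
cmap = np.linalg.norm(current[:, None, :] - nxt[None, :, :], axis=-1)
_, idx = hungarian(cmap)
for j in range(len(current)):
    if cmap[j, idx[j]] < cost:
        print("detection %d continues as detection %d" % (j, idx[j]))
    else:
        print("detection %d has no affordable match" % j)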
Example #2
File: metric.py  Project: vseledkin/L2C
    def optimal_assignment(self, gt_n_cluster=None, assign=None):
        if assign is None:
            mat = -self.conf.cpu().numpy()  # hungarian finds the minimum cost
            r, assign = hungarian(mat)
        self.conf = self.conf[:, assign]
        self.gt_n_cluster = gt_n_cluster
        return self
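Example #2 negates the confusion matrix so that the Hungarian minimization maximizes the matched counts, then permutes the columns so those matches land on the diagonal. A hedged sketch of the same trick with a plain NumPy array (the original operates on a torch tensor):

import numpy as np
from scipy.optimize import linear_sum_assignment as hungarian

conf = np.array([[1, 9, 0],
                 [8, 2, 0],
                 [0, 1, 7]])      # rows: ground-truth classes, cols: clusters
_, assign = hungarian(-conf)      # negate: hungarian finds the minimum cost
conf = conf[:, assign]            # matched counts now sit on the diagonal
print(conf.trace() / conf.sum())  # clustering accuracy, here 24 / 28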
Example #3
def compute_hungarian(m):
  assert m.size()[0] == m.size()[1]
  m_numpy = m.cpu().detach().numpy()
  row, col = hungarian(m_numpy)
  matrix = np.zeros(m.size())
  matrix[row, col] = 1. / float(len(m))
  cost = (matrix * m_numpy).sum()
  return cost, torch.tensor(matrix), col
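Hypothetical usage of compute_hungarian above: the returned matrix is the optimal permutation scaled by 1/n, so (matrix * m).sum() equals the mean per-row assignment cost (this assumes numpy, torch, and the hungarian alias are already imported alongside the function).

import torch

m = torch.tensor([[4.0, 1.0],
                  [2.0, 3.0]])
cost, matrix, col = compute_hungarian(m)
print(cost)    # (1 + 2) / 2 = 1.5
print(matrix)  # 0.5 at the matched (row, col) positions, 0 elsewhere
print(col)     # [1 0]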
Example #4
        def count_disambiguations(tags0, kws0, candidate_keywords_per_tag, n_kws, n_tags):
            """Receives an initial (fixed) assignment of tags to keywords and computes how many disambiguations are solved
            current_map is a dictionary mapping tag to sets of candidate keywords"""


            current_map = candidate_keywords_per_tag.copy()
            for tag_id, kw_id in zip(tags0, kws0):
                current_map[tag_id] = [kw_id]

            ambiguous_tags = [tag_id for tag_id, candidates in current_map.items() if len(candidates) > 1]
            known_tags = [tag_id for tag_id, candidates in current_map.items() if len(candidates) == 1]
            fresh_pass = False
            while not fresh_pass and len(ambiguous_tags) > 1:
                fresh_pass = True
                for tag_id in ambiguous_tags:
                    current_candidates = current_map[tag_id]
                    new_candidates = []
                    for candidate_kw_id in current_candidates:
                        check = [m_matrix[current_map[known_tag][0]][candidate_kw_id] - window <= m_obs_matrix[known_tag][tag_id] <=
                                 m_matrix[current_map[known_tag][0]][candidate_kw_id] + window for known_tag in known_tags]
                        if all(check):
                            new_candidates.append(candidate_kw_id)

                    if len(current_candidates) > len(new_candidates):
                        fresh_pass = False
                        current_map[tag_id] = new_candidates
                        # print("  Removed {:d} candidates".format(len(current_candidates) - len(new_candidates)))

                for tag_id in list(ambiguous_tags):  # iterate over a copy: items are removed below
                    if len(current_map[tag_id]) == 1:
                        ambiguous_tags.remove(tag_id)
                        known_tags.append(tag_id)
                        # print("  tag {:d} is disambiguated".format(tag_id))
                        for tag_id_others in ambiguous_tags:
                            if tag_id != tag_id_others and current_map[tag_id][0] in current_map[tag_id_others]:
                                current_map[tag_id_others].remove(current_map[tag_id][0])
                    elif len(current_map[tag_id]) == 0:
                        # print("  Inconsistency!")
                        return 0, None  # Inconsistency

            n_disambiguations = len(known_tags)

            cost_matrix = np.ones((n_kws, n_tags))
            for i_tag in range(n_tags):
                for candidate_kw_id in current_map[i_tag]:
                    cost_matrix[candidate_kw_id, i_tag] = 0
            row_ind, col_ind = hungarian(cost_matrix)
            if cost_matrix[row_ind, col_ind].sum() > 0:
                # print("  There was no consistent matching!")
                return 0, None

            query_predictions_for_each_tag = {}
            for tag, keyword in zip(col_ind, row_ind):
                query_predictions_for_each_tag[tag] = keyword

            # print("  This matching has {:d} disambiguations, returning...".format(n_disambiguations))
            return n_disambiguations, query_predictions_for_each_tag
Example #5
    def maximumANDSum(self, nums, numSlots):
        """
        :type nums: List[int]
        :type numSlots: int
        :rtype: int
        """
        # Pad nums with zeros up to 2 * numSlots and build a cost matrix whose
        # (i, x) entry is the negated AND of nums[i] with slot number 1 + x // 2
        # (each slot appears twice); the Hungarian algorithm then minimizes it.
        adj = [[
            -((nums[i] if i < len(nums) else 0) & (1 + x // 2))
            for x in xrange(2 * numSlots)
        ] for i in xrange(2 * numSlots)]
        return -sum(adj[i][j] for i, j in itertools.izip(*hungarian(adj)))
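Example #5 appears to be LeetCode 2172 (Maximum AND Sum of Array), written for Python 2 (xrange, itertools.izip). Each slot 1..numSlots can hold two numbers, so every slot is duplicated via 1 + x // 2 and nums is padded with zeros. A Python 3 rendering as a standalone function (the name and scipy alias are mine):

from scipy.optimize import linear_sum_assignment as hungarian

def maximum_and_sum(nums, numSlots):
    adj = [[-((nums[i] if i < len(nums) else 0) & (1 + x // 2))
            for x in range(2 * numSlots)]
           for i in range(2 * numSlots)]
    rows, cols = hungarian(adj)
    return -sum(adj[i][j] for i, j in zip(rows, cols))

print(maximum_and_sum([1, 2, 3, 4, 5, 6], 3))  # 9, matching the LeetCode sample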
Example #6
    def _run_hungarian_attack_given_matrix(self, c_matrix):
        """Runs the Hungarian algorithm with the given cost matrix
        :param c_matrix: cost matrix, (n_keywords x n_tags)"""

        row_ind, col_ind = hungarian(c_matrix)

        query_predictions_for_each_tag = {}
        for tag, keyword in zip(col_ind, row_ind):
            query_predictions_for_each_tag[tag] = keyword

        return query_predictions_for_each_tag
Example #7
def calc_wasserstein(Dj, Mk):
    # calculates the 2-Wasserstein L2 distance between two diagrams
    # inputs: diagram Dj, centre Mk
    # returns: W_2(Dj, Mk)
    m = len(Dj)
    c = calc_cost_matrix(Dj, Mk)
    X = hungarian(c)
    total = 0
    for i in range(m):
        total += c[X[0][i]][X[1][i]]
    return math.sqrt(total)
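calc_cost_matrix is not shown in this example. A hedged stand-in that makes calc_wasserstein behave as a 2-Wasserstein (L2) distance between equal-length diagrams is the matrix of squared Euclidean distances; the project's real helper may additionally handle matching points to the diagonal.

import math
import numpy as np
from scipy.optimize import linear_sum_assignment as hungarian

def calc_cost_matrix(Dj, Mk):
    # squared L2 distance between every pair of diagram points
    Dj, Mk = np.asarray(Dj, float), np.asarray(Mk, float)
    return ((Dj[:, None, :] - Mk[None, :, :]) ** 2).sum(axis=-1)

print(calc_wasserstein([[0.0, 1.0], [2.0, 3.0]],
                       [[0.0, 1.0], [2.0, 4.0]]))  # sqrt(0 + 1) = 1.0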
Example #8
File: npext.py  Project: cortu01/mpctools
def maximise_trace(x):
    """
    Maximise the Trace of a SQUARE Matrix X using the Hungarian Algorithm

    :param x: Numpy 2D SQUARE Array
    :return: Tuple containing (in order):
                * optimal permutation of columns to achieve a maximal trace
                * size of this trace
    """
    _rows, _cols = hungarian(np.full(len(x), np.max(x)) - x)
    return _cols, x[_rows, _cols].sum()
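Hypothetical usage: subtracting from np.max(x) turns trace maximization into the cost minimization the Hungarian algorithm solves (assumes numpy and the hungarian alias are imported alongside the function).

import numpy as np

x = np.array([[1, 9],
              [8, 2]])
perm, trace = maximise_trace(x)
print(perm, trace)         # [1 0] 17
print(x[:, perm].trace())  # 17: columns permuted into place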
Example #9
    def maximumANDSum(self, nums, numSlots):
        """
        :type nums: List[int]
        :type numSlots: int
        :rtype: int
        """

        # Template translated from:
        # https://github.com/kth-competitive-programming/kactl/blob/main/content/graph/WeightedMatching.h
        def hungarian(a):  # Time: O(n^2 * m), Space: O(n + m)
            if not a:
                return 0, []
            n, m = len(a) + 1, len(a[0]) + 1
            u, v, p, ans = [0] * n, [0] * m, [0] * m, [0] * (n - 1)
            for i in xrange(1, n):
                p[0] = i
                j0 = 0  # add "dummy" worker 0
                dist, pre = [float("inf")] * m, [-1] * m
                done = [False] * (m + 1)
                while True:  # dijkstra
                    done[j0] = True
                    i0, j1, delta = p[j0], None, float("inf")
                    for j in xrange(1, m):
                        if done[j]:
                            continue
                        cur = a[i0 - 1][j - 1] - u[i0] - v[j]
                        if cur < dist[j]:
                            dist[j], pre[j] = cur, j0
                        if dist[j] < delta:
                            delta, j1 = dist[j], j
                    for j in xrange(m):
                        if done[j]:
                            u[p[j]] += delta
                            v[j] -= delta
                        else:
                            dist[j] -= delta
                    j0 = j1
                    if not p[j0]:
                        break
                while j0:  # update alternating path
                    j1 = pre[j0]
                    p[j0], j0 = p[j1], j1
            for j in xrange(1, m):
                if p[j]:
                    ans[p[j] - 1] = j - 1
            return -v[0], ans  # min cost

        return -hungarian([[
            -((nums[i] if i < len(nums) else 0) & (1 + x // 2))
            for x in xrange(2 * numSlots)
        ] for i in xrange(2 * numSlots)])[0]
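The template above is Python 2 (xrange). If the nested hungarian is lifted to module level and xrange replaced with range, a quick sanity check on a 2x2 matrix looks like the sketch below; the optimal assignment is row 0 -> column 1 and row 1 -> column 0, with total cost 1 + 2 = 3.

min_cost, ans = hungarian([[4, 1],
                           [2, 3]])
print(min_cost)  # 3
print(ans)       # [1, 0]: the column assigned to each row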
Example #10
def matchIds(bboxes, predbboxes):
    a = (bboxes[:, 0:2] + bboxes[:, 2:4]) / 2
    b = (predbboxes[:, 0:2] + predbboxes[:, 2:4]) / 2

    m = np.zeros((bboxes.shape[0], predbboxes.shape[0]))

    for i, p1 in enumerate(a):
        for j, p2 in enumerate(b):
            d = np.sqrt(np.sum((p1 - p2)**2))
            m[i, j] = d

    bbox_ids, pred_ids = hungarian(m)
    # print(predbboxes.shape, pred_ids.shape)
    return bbox_ids, predbboxes[pred_ids, 4]
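Hypothetical usage with synthetic [x1, y1, x2, y2] boxes: predbboxes carries a track id in column 4, and matchIds pairs each current box with the predicted box whose centre is nearest, returning the matched ids (assumes numpy and the hungarian alias are imported alongside the function).

import numpy as np

bboxes = np.array([[0., 0., 10., 10.],
                   [100., 100., 120., 120.]])
predbboxes = np.array([[101., 99., 121., 121., 7.],  # predicted track id 7
                       [1., -1., 11., 11., 3.]])     # predicted track id 3
bbox_ids, ids = matchIds(bboxes, predbboxes)
print(bbox_ids, ids)  # [0 1] [3. 7.]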
Example #11
def evaluator(gtr,
              det_full,
              parts_name={
                  0: 'Tail',
                  1: 'Head',
                  2: 'Torax',
                  3: 'Rant',
                  4: 'Lant'
              },
              threshold=50,
              scale=1):
    # Ground truth as an array
    nparts = len(parts_name.keys())
    ground = {}
    ground_mappings = {}
    for fr in gtr['annotations']:
        ground[fr['image_id']] = []
        ground_mappings[fr['image_id']] = []
    for fr in gtr['annotations']:
        if nparts == 3:
            ground_mappings[fr['image_id']].append([
                fr['keypoints'][:2], fr['keypoints'][4:6],
                distance(fr['keypoints'][:2], fr['keypoints'][2:4])
            ])
        else:
            ground_mappings[fr['image_id']].append([
                fr['keypoints'][:2], fr['keypoints'][2:4],
                distance(fr['keypoints'][:2], fr['keypoints'][2:4])
            ])

        for k in range(0, nparts * 2 - 1, 2):
            ground[fr['image_id']].append(
                [fr['keypoints'][k], fr['keypoints'][k + 1]])

    # Detection keys as in ground truth
    detections = {}
    mappings = {}
    for key in det_full.keys():
        if key == 'runningtime':
            continue
        detections[int(key.split("/")[-1].split(".")[0])] = np.array(
            det_full[key]['detections']) * scale
        temp = det_full[key]['mapping']
        mappings[int(key.split("/")[-1].split(".")[0])] = []
        for t in temp:
            mappings[int(key.split("/")[-1].split(".")[0])].append([
                np.array([t[1][0], t[0][0]]) * scale,
                np.array([t[1][1], t[0][1]]) * scale, t[2], t[3]
            ])
    # Evaluations
    evaluations = {}
    retrieval = {}

    detect = detections
    for k in ground.keys():
        if k not in detect.keys():
            continue
        evaluations[k] = {}
        retrieval[k] = {}
        gt = ground[k]
        gt_map = ground_mappings[k]
        dt_map = mappings[k]
        dt = detect[k]
        cm = cost_matrix(gt, dt, threshold)
        gt_idx, dt_idx = hungarian(cm)
        cm_map = cost_matrix_mappings(ground_mappings[k], mappings[k],
                                      threshold)
        gmap_idx, dmap_idx = hungarian(cm_map)
        assignments_cost = cm[gt_idx, dt_idx]
        assignments_cost_map = cm_map[gmap_idx, dmap_idx]
        item = evaluations[k]
        item['gt_total_parts'] = len(gt)
        item['gt_total_maps'] = len(gt_map)
        item['dt_total_parts'] = len(dt)
        item['dt_total_maps'] = len(dt_map)
        item['gt_individuals'] = len(gt) // nparts
        item['dt_individuals'] = len(dt) // nparts
        item['cumulative_error'] = cm[gt_idx[:len(gt)], dt_idx[:len(gt)]].sum()
        item['total_avg_error'] = cm[gt_idx[:len(gt)], dt_idx[:len(gt)]].mean()
        item['total_std_error'] = cm[gt_idx[:len(gt)], dt_idx[:len(gt)]].std()
        item['cumulative_error_map'] = cm_map[gmap_idx[:len(gt_map)],
                                              dmap_idx[:len(gt_map)]].sum()
        item['avg_error_map'] = cm_map[gmap_idx[:len(gt_map)],
                                       dmap_idx[:len(gt_map)]].mean()
        item['std_error_map'] = cm_map[gmap_idx[:len(gt_map)],
                                       dmap_idx[:len(gt_map)]].std()
        #retrieval[k]['cost_matrix'] =cm
        #retrieval[k]['cost_matrix_map'] =cm_map
        #retrieval[k]['cost_matrix_assignments'] = assignments_cost
        #retrieval[k]['cost_matrix_assignments_map'] = assignments_cost_map

        retrieval[k]['dt_idx'] = dt_idx
        for i in range(nparts):
            temp = np.array([
                1 if assignments_cost[j] < threshold else 0
                for j in range(i, len(gt), nparts)
            ])
            item[parts_name[i] + ' score'] = (
                100 * temp.sum() / evaluations[k]['gt_individuals'])
            item[parts_name[i] + ' total'] = temp.sum()

        temp = np.array([
            1 if assignments_cost_map[j] < threshold * 2 else 0
            for j in range(len(gt_map))
        ])
        item['matching_score'] = 100 * temp.sum() / len(gt_map)
        item['matching_total'] = temp.sum()
    retrieval['ground'] = ground
    retrieval['ground_mappings'] = ground_mappings
    retrieval['detections'] = detections
    retrieval['mappings'] = mappings
    return evaluations, retrieval
Example #12
    def recognise(self, frameNumber, detections, bbox, verbose=False):
        """
        Update tracker with new measurements.
        This is done by calculating a pairwise distance matrix and finding the optimal solution through the Hungarian algorithm.
        
        Input:
            frameNumber: The current frame number (Int)
            detections: List of cv2.KeyPoint objects (the detections) found in the current frame.
            bbox: List of dicts containing bounding boxes associated with the detected keypoints.
            verbose: Whether to print information or not.

        Output:
            tracks: List of Track objects
        """

        for idx in reversed(range(len(bbox))):
            if bbox[idx]["confidence"] < self.minConfidence:
                del bbox[idx]
                del detections[idx]

        self.detections = detections

        # Update tracking according to matches
        numNew = len(self.detections)
        numOld = len(self.tracks)
        if (verbose):
            print("New detections: ", numNew)
            print("Existing tracks: ", numOld)

        for t in self.tracks:
            if verbose:
                print("ID {} - Kill Count {}".format(t.id, t.killCount))
            t.killCount += 1

        # Construct cost matrix
        costM = self.pairwiseDistance(detections, self.tracks)

        row_ind, col_ind = hungarian(costM)
        matches = [(row_ind[i], col_ind[i]) for i in range(row_ind.shape[0])]

        killedTracks = []
        for (mRow, pCol) in matches:
            ## If the assignment cost is below the Ghost threshold, then update the existing tracklet
            if (costM[mRow][pCol] < self.ghostThreshold):
                # Update existing track with measurement
                p = np.array(detections[mRow].pt)
                self.tracks[pCol].pos.append(p)
                self.tracks[pCol].bbox.append(
                    self.convertBBoxtoList(bbox[mRow]))
                self.tracks[pCol].M = self.matrixInverse(bbox[mRow]["cov"])
                self.tracks[pCol].mean = bbox[mRow]["mean"]
                self.tracks[pCol].frame.append(frameNumber)
                self.tracks[pCol].killCount = 0

            ## If the cost assignment is higher than the ghost threshold, then either create a new track or kill an old one
            else:
                # A new track is created if the following is true:
                # 1) The cost (L2 distance) is higher than the ghost threshold
                # 2) It is an actual detection (mRow < numNew)
                if (mRow < numNew):
                    # Create new track
                    newTrack = Track()
                    p = np.array(detections[mRow].pt)
                    newTrack.pos.append(p)
                    newTrack.bbox.append(self.convertBBoxtoList(bbox[mRow]))
                    newTrack.M = self.matrixInverse(bbox[mRow]["cov"])
                    newTrack.mean = bbox[mRow]["mean"]
                    newTrack.frame.append(frameNumber)
                    newTrack.id = self.trackCount
                    self.trackCount += 1
                    self.tracks.append(newTrack)

                    if verbose:
                        print("Num tracks: {}".format(len(self.tracks)))

                # The track is deleted if the following is true:
                # 1) The assigned detection is a dummy detection (mRow >= numNew),
                # 2) There are more tracks than detections (numOld > numNew)
                # 3) The assigned track is a real track (pCol < numOld)
                elif (numOld > numNew and pCol < numOld):
                    if (self.tracks[pCol].killCount > self.maxKillCount):
                        killedTracks.append(pCol)

                        if verbose:
                            print("Num tracks: {}".format(len(self.tracks)))

        for pCol in sorted(killedTracks, reverse=True):
            self.oldTracks.append(self.tracks.pop(pCol))

        del costM
        if verbose:
            print()
Example #13
def gen_assignment_hungarian(trees_a, trees_b):
    cost = np.array([[tree_b.cost(tree_a) for tree_b in trees_b]
                     for tree_a in trees_a])
    row_ind, col_ind = hungarian(cost)
    pairs = [(trees_a[r], trees_b[c]) for r, c in zip(row_ind, col_ind)]
    return pairs
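Hypothetical usage with a toy tree type whose cost() is the absolute difference of a scalar value, standing in for the project's real tree cost:

import numpy as np
from scipy.optimize import linear_sum_assignment as hungarian

class ToyTree:
    def __init__(self, value):
        self.value = value

    def cost(self, other):
        return abs(self.value - other.value)

trees_a = [ToyTree(1), ToyTree(10)]
trees_b = [ToyTree(9), ToyTree(2)]
for a, b in gen_assignment_hungarian(trees_a, trees_b):
    print(a.value, "->", b.value)  # 1 -> 2, then 10 -> 9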
Example #14
def hungarian_tracking(detections_path, cost, output='', part=str(2)):
    """
    This function executes the hungarian algorithm. It is expecting to receive an instance of detections. Please check documentation for data structure. It will also consider the maximum distance and it outputs in a new file with the data structure explained in documentation. 
    
    Inputs: 
        - detections_path: Path to find detections file
        - cost : Maximum distance allowed
        - output: Optional, if '', will use the same path as were the detections are. 
        - part : Over what part perform the tracking. By default thorax or '2'
    
    """
    if output == '':
        output = detections_path
    detections = read_json(detections_path)
    keylist = list(detections.keys())
    cmaps = {}
    tracks = np.zeros((len(keylist), 70))
    key = keylist[0]
    parts = detections[key]['parts'][part]
    boxes = dets2boxes(parts, size=20)
    parts = boxes2dets(non_max_suppression_slow(boxes, 0.6)[::-1])
    track_id = {}
    for j in range(len(parts)):
        tracks[0][j] = j + 1
        track_id[j + 1] = {}
        track_id[j + 1]['init_frame'] = 0
        track_id[j + 1]['cost'] = [0]
        track_id[j + 1]['positions'] = [parts[j]]
    for i in range(len(keylist) - 1):
        key = keylist[i]
        key_next = keylist[i + 1]
        parts = detections[key]['parts'][part]
        boxes = dets2boxes(parts, size=20)
        parts = boxes2dets(non_max_suppression_slow(boxes, 0.6)[::-1])
        parts_next = detections[key_next]['parts'][part]
        boxes = dets2boxes(parts_next, size=20)
        parts_next = boxes2dets(non_max_suppression_slow(boxes, 0.6)[::-1])
        cmap = cost_matrix_tracks(parts, parts_next, cost)
        cmaps[key] = cmap
        _, idx = hungarian(cmap)
        for j in range(len(parts)):
            if cmap[j, idx[j]] < cost:

                if tracks[i][j] == 0:
                    #create new track for detection j at frame i
                    tracks[i][j] = len(track_id.keys()) + 1

                    k = int(tracks[i][j])  # id of the track
                    track_id[k] = {
                        'init_frame': i,
                        'cost': [0],
                        'positions': [parts[j]]
                    }

                tracks[i + 1][idx[j]] = tracks[i][j]
                track_id[int(tracks[i + 1][idx[j]])]['cost'].append(
                    cmap[j, idx[j]])
                track_id[int(tracks[i + 1][idx[j]])]['positions'].append(
                    parts_next[idx[j]])

    for k in track_id.keys():
        track_id[k]['mean'] = np.array(track_id[k]['cost']).mean()

    new_tracks = {}
    new_tracks['frames'] = {}
    new_tracks['data'] = {}
    for f in range(len(keylist)):
        det = detections[keylist[f]]['parts'][part]
        mapping = detections[keylist[f]]['mapping']
        new_tracks['frames'][f] = []
        for i in range(len(det)):

            if int(tracks[f][i]) == 0:
                continue

            else:
                angle = find_angle(det[i], mapping)
                new_tracks['frames'][f].append(tracks[f][i])
                new_tracks['data'][tracks[f][i]] = {
                    'frame': f,
                    'id': tracks[f][i],
                    'location': det[i],
                    'init_frame': track_id[int(tracks[f][i])]['init_frame'],
                    'positions': track_id[int(tracks[f][i])]['positions'],
                    'angle': angle
                }
    print('saving trackings')
    folder = '/'.join(output.split('/')[:-1])
    output1 = os.path.join(folder, 'track_nms_v2_' + output.split('/')[-1])
    with open(output1, 'w') as outfile:
        json.dump(new_tracks, outfile, cls=NumpyEncoder)
    output2 = os.path.join(folder, 'id_nms_track_' + output.split('/')[-1])
    with open(output2, 'w') as outfile:
        json.dump(track_id, outfile, cls=NumpyEncoder)

    output3 = os.path.join(folder, 'track_nms_' + output.split('/')[-1])
    with open(output3, 'w') as outfile:
        json.dump(tracks, outfile, cls=NumpyEncoder)
Example #15
def calc_frechet_mean(D, r, k, verbose):
    # computes the weighted frechet mean of D with weights r[.][k]
    # inputs: diagrams D, membership values r, centre index k, verbose
    # returns: weighted frechet mean y, optimal pairings x

    n = len(D)
    m = len(D[0])
    # initialise to random diagram in D
    random.seed(0)
    M_update = D[random.randint(0, n - 1)]

    # first run to find matching
    matching = []
    for j in range(n):
        c = calc_cost_matrix(M_update, D[j])
        x_indices = hungarian(c)
        matching.append(x_indices)

    # loop until stopping condition is found
    counter2 = 0

    while True:
        counter2 += 1

        # update matched points
        x = np.zeros((n, m, 2))
        for j in range(n):
            for i in range(m):
                index = matching[j][1][i]
                x[j][i] = D[j][index]

        # generate y to return
        y = np.zeros((m, 2))

        # loop over each point
        for i in range(m):
            # calculate w and w_\Delta
            r2_od = 0
            r2x_od = [0, 0]
            for j in range(n):
                if x[j][i][0] != -1:
                    r2_od += r[j][k]**2
                    r2x_od[0] += r[j][k]**2 * x[j][i][0]
                    r2x_od[1] += r[j][k]**2 * x[j][i][1]

            # if all points are diagonals
            if r2_od == 0:
                # then y[i] is a diagonal
                y[i] = [-1, -1]

            # else proceed
            else:
                w = [r2x_od[0] / r2_od, r2x_od[1] / r2_od]
                w_delta = [(w[0] + w[1]) / 2, (w[0] + w[1]) / 2]

                r2_d = 0
                r2_w_delta = [0, 0]
                for j in range(n):
                    if x[j][i][0] == -1:
                        r2_d += r[j][k]**2
                        r2_w_delta[0] += r[j][k]**2 * w_delta[0]
                        r2_w_delta[1] += r[j][k]**2 * w_delta[1]

                # calculate weighted mean
                y[i][0] = (r2x_od[0] + r2_w_delta[0]) / (r2_od + r2_d)
                y[i][1] = (r2x_od[1] + r2_w_delta[1]) / (r2_od + r2_d)

        old_matching = matching.copy()
        matching = []
        for j in range(n):
            c = calc_cost_matrix(y, D[j])
            x_indices = hungarian(c)
            matching.append(x_indices)

        comparison = (np.array(matching) == np.array(old_matching))
        if comparison.all():
            if verbose:
                print("      Frechet iterations for M_" + str(k) + ": " +
                      str(counter2))
            return y, x
Example #16
    def fit(self, data: pd.DataFrame, seed: Optional[int] = None):
        # Given a d-dimensional random vector x and its (d,n) observed data matrix X,
        # apply an ICA algorithm to obtain an estimate of A.
        d = len(data.columns)
        B = FastICA(random_state=seed).fit(data).components_
        # Find the unique permutation of the rows of W = A^-1 that yields a matrix W'
        # without any zeros on the main diagonal. The permutation is sought by minimizing
        # sum_i (1/|W'_ii|). This minimization problem is the classical linear assignment
        # problem, and here the Hungarian algorithm (Kuhn, 1955) is used.
        _, K = hungarian(1 / np.abs(B))
        B = B.take(K, 0)
        # Divide each row of W' by its corresponding diagonal element in order to
        # yield a new matrix W'' with a diagonal consisting entirely of 1s.
        B /= B.diagonal()[..., None]
        # Compute an estimate B' of B by using B' = I - W''.
        B = np.identity(d) - B
        # Finally, to estimate a causal order k(i), determine the permutation matrix
        # K of B', obtaining the matrix B' = PB'K^T that is as close as possible
        # to having a strictly lower triangular structure.
        K = None
        if d < 8:
            # For a small number of variables, i.e., fewer than 8, the lower triangularity
            # of B' can be measured by using the sum of squared bij in its upper triangular
            # section sum_i<=j (b'_ij^2). In addition, an exhaustive search over all possible
            # permutations is feasible and is hence performed.
            vmin = np.inf
            for p in permutations(range(d)):
                score = np.sum(np.square(np.triu(B.take(p, 0))))
                if score < vmin:
                    vmin = score
                    K = p
            K = np.array(K)
        else:
            # For higher-dimensional data, the following approximate algorithm is used,
            # which sets small absolute valued elements in B' to zero, and whereby it can be
            # determined whether it is possible to permute the resulting matrix to become
            # strictly lower triangular:
            # (a) Set the d(d+1)/2 smallest (in absolute value) elements of B' to zero.
            i = round(d * (d + 1) / 2)
            pmin = np.argsort(np.abs(np.ravel(B)))
            B.flat[pmin[:i]] = 0
            # (b) Repeat
            while K is None:
                # i. Determine whether B' can be permuted to become strictly lower triangular.
                # If this is possible, stop and return the permuted B'.
                K, A, L = np.zeros(d, int), np.arange(d), B
                while len(A) > 0:
                    # Find a row where all elements are zero, if any.
                    j = np.where(np.sum(np.abs(L), axis=1) == 0)
                    # If there is no row with zero elements, exit.
                    if len(j[0]) == 0:
                        K = None
                        break
                    # Select the first row with zero elements.
                    j = j[0][0]
                    # Add original index to permutation matrix.
                    K[d - len(A)] = A[j]
                    A = np.delete(A, j)
                    # Remove selected row and columns.
                    mask = np.delete(np.arange(len(L)), j)
                    L = L[mask][:, mask]
                # ii. In addition, set the next smallest (in absolute value) element of Bb to zero.
                if K is None:
                    B.flat[pmin[i]] = 0
                    i += 1

        return K
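A standalone look at the Hungarian step in the middle of fit: the rows of the unmixing matrix are permuted so that no diagonal element is near zero, by minimizing sum_i 1/|W'_ii|. A minimal sketch, again assuming hungarian is scipy's linear_sum_assignment:

import numpy as np
from scipy.optimize import linear_sum_assignment as hungarian

B = np.array([[0.1, 2.0],
              [3.0, 0.2]])
_, K = hungarian(1 / np.abs(B))  # small 1/|entry| means a large |entry| is cheap
print(K)                         # [1 0]
print(B.take(K, 0))              # rows reordered: the diagonal is now [3.0, 2.0]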
Example #17
    def _run_algorithm(self,
                       m_matrix,
                       m_obs_matrix,
                       window,
                       tags_by_popularity,
                       brute_force_size=10):
        """Runs the generalized count attack using the brute force method and Hoefding bounds.
        Returns a dictionary that maps tag_ids to their assigned keywords (query_predictions_for_each_tag)
        Returns 0 instead if there is a global inconsistency"""
        def count_disambiguations(tags0, kws0, candidate_keywords_per_tag,
                                  n_kws, n_tags):
            """Receives an initial (fixed) assignment of tags to keywords and computes how many disambiguations are solved
            current_map is a dictionary mapping tag to sets of candidate keywords"""

            current_map = candidate_keywords_per_tag.copy()
            for tag_id, kw_id in zip(tags0, kws0):
                current_map[tag_id] = [kw_id]

            ambiguous_tags = [
                tag_id for tag_id, candidates in current_map.items()
                if len(candidates) > 1
            ]
            known_tags = [
                tag_id for tag_id, candidates in current_map.items()
                if len(candidates) == 1
            ]
            fresh_pass = False
            while not fresh_pass and len(ambiguous_tags) > 1:
                fresh_pass = True
                for tag_id in ambiguous_tags:
                    current_candidates = current_map[tag_id]
                    new_candidates = []
                    for candidate_kw_id in current_candidates:
                        check = [
                            m_matrix[current_map[known_tag]
                                     [0]][candidate_kw_id] - window <=
                            m_obs_matrix[known_tag][tag_id] <=
                            m_matrix[current_map[known_tag]
                                     [0]][candidate_kw_id] + window
                            for known_tag in known_tags
                        ]
                        if all(check):
                            new_candidates.append(candidate_kw_id)

                    if len(current_candidates) > len(new_candidates):
                        fresh_pass = False
                        current_map[tag_id] = new_candidates
                        # print("  Removed {:d} candidates".format(len(current_candidates) - len(new_candidates)))

                for tag_id in list(ambiguous_tags):  # iterate over a copy: items are removed below
                    if len(current_map[tag_id]) == 1:
                        ambiguous_tags.remove(tag_id)
                        known_tags.append(tag_id)
                        # print("  tag {:d} is disambiguated".format(tag_id))
                        for tag_id_others in ambiguous_tags:
                            if tag_id != tag_id_others and current_map[tag_id][
                                    0] in current_map[tag_id_others]:
                                current_map[tag_id_others].remove(
                                    current_map[tag_id][0])
                    elif len(current_map[tag_id]) == 0:
                        # print("  Inconsistency!")
                        return 0, None  # Inconsistency

            n_disambiguations = len(known_tags)

            cost_matrix = np.ones((n_kws, n_tags))
            for i_tag in range(n_tags):
                for candidate_kw_id in current_map[i_tag]:
                    cost_matrix[candidate_kw_id, i_tag] = 0
            row_ind, col_ind = hungarian(cost_matrix)
            if cost_matrix[row_ind, col_ind].sum() > 0:
                # print("  There was no consistent matching!")
                return 0, None

            query_predictions_for_each_tag = {}
            for tag, keyword in zip(col_ind, row_ind):
                query_predictions_for_each_tag[tag] = keyword

            # print("  This matching has {:d} disambiguations, returning...".format(n_disambiguations))
            return n_disambiguations, query_predictions_for_each_tag

        assert len(tags_by_popularity) >= brute_force_size

        # Build candidate sets per tag
        candidate_keywords_per_tag = {}
        for tag_id in range(self.n_tags):
            kw_list = [
                kw_id for kw_id in range(self.n_keywords)
                if m_matrix[kw_id, kw_id] - window <= m_obs_matrix[
                    tag_id, tag_id] <= m_matrix[kw_id, kw_id] + window
            ]
            if len(kw_list) == 0:
                # print("  tag_{:d} had zero candidates, aborting...".format(tag_id))
                return None
            candidate_keywords_per_tag[tag_id] = kw_list
        # print("LIST OF CANDIDATE KEYWORDS")
        # for tag_id in range(self.n_tags):
        #     print("{:d}: len={:d}".format(tag_id, len(candidate_keywords_per_tag[tag_id])))

        # Select brute-force sets to test
        candidate_sets_chosen = [
            candidate_keywords_per_tag[tag_id]
            for tag_id in tags_by_popularity[:brute_force_size]
        ]
        aux_combinations = list(itertools.product(*candidate_sets_chosen))

        all_combinations_to_test = [
            combination for combination in aux_combinations
            if len(combination) == len(set(combination))
        ]
        if len(all_combinations_to_test) == 0:
            return None
        # print("There are {:d} combinations to test".format(len(all_combinations_to_test)))

        # Compute number of disambiguations in each of those sets
        test_results = [
            count_disambiguations(tags_by_popularity[:brute_force_size],
                                  combination, candidate_keywords_per_tag,
                                  self.n_keywords, self.n_tags)
            for combination in all_combinations_to_test
        ]

        test_results.sort(key=lambda x: x[0], reverse=True)

        # Choose output:
        # If one of these brute-forced matchings was feasible:
        if test_results[0][1] is not None:
            # print("Found consistent mapping with {:d} disambiguated queries".format(test_results[0][0]))
            return test_results[0][1]
        else:
            # Ensure there is at least one feasible assignment with volumes
            cost_matrix = np.ones((self.n_keywords, self.n_tags))
            for i_tag in range(self.n_tags):
                for candidate_kw_id in candidate_keywords_per_tag[i_tag]:
                    cost_matrix[candidate_kw_id, i_tag] = 0
            row_ind, col_ind = hungarian(cost_matrix)
            if cost_matrix[row_ind, col_ind].sum() > 0:
                # print("Could not find any consistent mapping at all...")
                return None
            else:
                # print("Could not find any mapping consistent with co-occurences, returning one that is consistent with volumes...")
                feasible_assignment = {}
                for tag, keyword in zip(col_ind, row_ind):
                    feasible_assignment[tag] = keyword
                return feasible_assignment