def predict(
    self,
    X,
    csr_codes=None,
    only_topk=None,
    cond_prob=True,
    normalize=False,
    **arg_kw,
):
    """Score labels by multiplying code assignments with ``self.code_to_label``.

    Args:
        X: instance matrix; only ``X.shape[0]`` is read, to validate the row
            count of ``csr_codes``.
        csr_codes: sparse instance-by-code matrix. Required despite the
            ``None`` default.
        only_topk: forwarded unchanged to ``smat_util.sorted_csr``.
        cond_prob: when true, the stored values of ``csr_codes`` weight the
            product; when false, every stored entry is treated as 1.
        normalize: when true, the result is passed through ``sk_normalize``
            with ``axis=1`` and ``norm="l1"``.
        **arg_kw: accepted for call compatibility and ignored.

    Returns:
        The matrix returned by ``smat_util.sorted_csr`` (optionally
        normalized).

    Raises:
        AssertionError: if ``csr_codes`` is missing or its shape does not
            match ``X.shape[0]`` / ``self.nr_codes``.
    """
    assert csr_codes is not None, "csr_codes must be provided for CountModel.predict"
    assert csr_codes.shape[0] == X.shape[0]
    assert csr_codes.shape[1] == self.nr_codes

    if cond_prob:
        pred_csr = csr_codes.dot(self.code_to_label).tocsr()
    else:
        # Temporarily replace the stored values with ones so the product only
        # counts code membership.  The caller's values are restored in a
        # ``finally`` so a failing ``dot`` cannot leave the input corrupted.
        original_values = csr_codes.data
        csr_codes.data = sp.ones_like(original_values)
        try:
            pred_csr = csr_codes.dot(self.code_to_label).tocsr()
        finally:
            csr_codes.data = original_values

    pred_csr = smat_util.sorted_csr(pred_csr, only_topk=only_topk)
    if normalize:
        pred_csr = sk_normalize(pred_csr, axis=1, copy=False, norm="l1")
    return pred_csr
def rank_average(*args):
    """Ensemble predictions by averaging their per-row relevance values.

    Every input is first validated with ``CsrEnsembler.check_validlity``.
    Each matrix is converted with ``smat_util.get_relevance_csr`` using the
    largest per-row entry count found across all inputs, the converted
    matrices are summed, and the stored values of the sorted sum are divided
    by the number of inputs.

    Args:
        *args: sparse CSR prediction matrices (``indptr`` is read directly).

    Returns:
        The sorted, averaged matrix.
    """
    CsrEnsembler.check_validlity(*args)

    # Longest row (in stored entries) over every input matrix.
    widest_row = max(
        (pred.indptr[1:] - pred.indptr[:-1]).max() for pred in args
    )

    relevance_sum = sum(
        smat_util.get_relevance_csr(pred, widest_row) for pred in args
    )
    averaged = smat_util.sorted_csr(relevance_sum)
    averaged.data /= len(args)
    return averaged
def predict_new(
    self,
    X,
    only_topk=None,
    csr_codes=None,
    cond_prob=None,
    normalized=False,
    threads=-1,
):
    """Predict label scores, optionally restricted by code assignments.

    Args:
        X: instance-by-feature matrix; ``X.shape[1]`` must equal
            ``self.nr_features``.
        only_topk: forwarded to the ``smat_util`` sorting helpers.
        csr_codes: optional sparse instance-by-code matrix.  When ``None``
            every label is scored through ``X.dot(self.W)``; otherwise the
            native ``clib.multilabel_predict_with_codes`` routine is used.
        cond_prob: optional object exposing ``transform(values, inplace=True)``
            and ``combiner(values, code_scores)``.  When falsy the raw scores
            are kept.
        normalized: when true, the result is passed through ``sk_normalize``
            with ``axis=1`` and ``norm="l1"``.
        threads: forwarded to the native prediction routine.

    Returns:
        The sorted sparse prediction matrix.

    Raises:
        AssertionError: on a feature/code dimension mismatch, or when codes
            are supplied but ``self.C`` is ``None``.
    """
    assert X.shape[1] == self.nr_features

    if csr_codes is None:
        dense = X.dot(self.W).toarray()
        if cond_prob:
            dense = cond_prob.transform(dense, inplace=True)
        coo = smat_util.dense_to_coo(dense)
        pred_csr = smat_util.sorted_csr_from_coo(
            coo.shape, coo.row, coo.col, coo.data, only_topk=only_topk
        )
    else:  # csr_codes is given
        assert self.C is not None, "This model does not have C"
        assert csr_codes.shape[0] == X.shape[0]
        assert csr_codes.shape[1] == self.nr_codes

        if not csr_codes.has_sorted_indices:
            csr_codes = csr_codes.sorted_indices()

        offset = None
        if (csr_codes.data == 0).sum() != 0:
            # Trick to avoid explicit removal of zero entries inside
            # smat_dot_smat: shift every stored value away from zero and
            # undo the shift on the returned code scores afterwards.
            offset = sp.absolute(csr_codes.data).max() + 1
            csr_codes = smat.csr_matrix(
                (csr_codes.data + offset, csr_codes.indices, csr_codes.indptr),
                shape=csr_codes.shape,
            )

        # Indices are already sorted at this point, so no further copy of
        # csr_codes is needed before handing it to the native layer.
        pZ = PyMatrix.init_from(csr_codes, self.dtype)
        csr_labels, pred_csr = clib.multilabel_predict_with_codes(
            X, self.pW, self.pC, pZ, threads=threads
        )
        if offset is not None:
            csr_labels.data -= offset

        if cond_prob:
            val = cond_prob.transform(pred_csr.data, inplace=True)
            # Write the combined scores back explicitly so the result is kept
            # even when the combiner returns a new array instead of updating
            # its first argument in place.
            pred_csr.data[:] = cond_prob.combiner(val, csr_labels.data)

        pred_csr = smat_util.sorted_csr(pred_csr, only_topk=only_topk)

    if normalized:
        pred_csr = sk_normalize(pred_csr, axis=1, copy=False, norm="l1")
    return pred_csr
def round_robin(*args):
    """Ensemble predictions by interleaving them in argument order.

    Each input is converted with ``smat_util.get_relevance_csr`` and its
    stored values are shifted by ``(len(args) - position) / (len(args) + 1)``,
    so earlier arguments receive the larger shift.  The shifted matrices are
    merged with an element-wise ``maximum``; the merged matrix is sorted and
    its stored values divided by the number of inputs.

    Args:
        *args: sparse CSR prediction matrices (``indptr`` is read directly).

    Returns:
        The sorted, merged matrix.
    """
    CsrEnsembler.check_validlity(*args)

    nr_models = len(args)
    step = 1.0 / (nr_models + 1.0)
    # Longest row (in stored entries) over every input matrix.
    widest_row = max(
        (pred.indptr[1:] - pred.indptr[:-1]).max() for pred in args
    )

    merged = None
    for position, pred in enumerate(args):
        shifted = smat_util.get_relevance_csr(pred, widest_row)
        shifted.data[:] += (nr_models - position) * step
        merged = shifted if merged is None else merged.maximum(shifted)

    merged = smat_util.sorted_csr(merged)
    merged.data /= nr_models
    return merged
def load_prediction(path_to_file, only_topk=None):
    """Read a sparse prediction matrix from a text file.

    The first line holds two integers: the number of instances and the number
    of labels.  Each of the following ``nr_insts`` lines holds
    whitespace-separated ``label:value`` tokens for one instance.

    Args:
        path_to_file: path of the text file to read.
        only_topk: forwarded to ``smat_util.sorted_csr``.

    Returns:
        The matrix returned by ``smat_util.sorted_csr``.

    Raises:
        ValueError: if the header does not contain exactly two integers or a
            token cannot be parsed.
    """
    with open(path_to_file, 'r') as fin:
        header_tokens = fin.readline().split()
        nr_insts, nr_labels = list(map(int, header_tokens))
        builder = smat_util.coo_appender((nr_insts, nr_labels))
        for row in range(nr_insts):
            for token in fin.readline().split():
                fields = token.split(':')
                builder.append(row, int(fields[0]), float(fields[1]))
    return smat_util.sorted_csr(builder.tocsr(), only_topk=only_topk)
def predict(
    self,
    X,
    only_topk=None,
    transform=None,
    csr_codes=None,
    cond_prob=None,
    normalized=False,
    threads=-1,
):
    """Predict label scores, optionally restricted by code assignments.

    Args:
        X: instance-by-feature matrix; ``X.shape[1]`` must equal
            ``self.nr_features``.
        only_topk: forwarded to the ``smat_util`` sorting helpers.
        transform: optional callable applied to the raw scores as
            ``transform(values, inplace=True)``.
        csr_codes: optional sparse instance-by-code matrix.  When given, only
            the (instance, label) pairs stored in ``csr_codes.dot(self.C.T)``
            are scored, through ``self.predict_values``.
        cond_prob: optional object whose ``combiner(values, code_scores)``
            merges label scores with the code scores.
        normalized: when true, the result is passed through ``sk_normalize``
            with ``axis=1`` and ``norm='l1'``.
        threads: forwarded to ``self.predict_values``.

    Returns:
        The sorted sparse prediction matrix.
    """
    assert X.shape[1] == self.nr_features

    if csr_codes is not None:
        assert self.C is not None, "This model does not have C"
        assert X.shape[1] == self.nr_features
        assert csr_codes.shape[0] == X.shape[0]
        assert csr_codes.shape[1] == self.nr_codes

        # Candidate labels per instance, with the propagated code scores.
        candidates = csr_codes.dot(self.C.T).tocsr()
        per_row_counts = candidates.indptr[1:] - candidates.indptr[:-1]
        row_ids = sp.repeat(
            sp.arange(X.shape[0], dtype=sp.uint32), per_row_counts
        )
        col_ids = candidates.indices.astype(sp.uint32)

        scores = self.predict_values(X, row_ids, col_ids, threads=threads)
        if transform:
            scores = transform(scores, inplace=True)
        if cond_prob:
            scores[:] = cond_prob.combiner(scores, candidates.data)

        pred_csr = smat.csr_matrix(
            (scores, col_ids, candidates.indptr), shape=candidates.shape
        )
        pred_csr = smat_util.sorted_csr(pred_csr, only_topk=only_topk)
    else:
        # No codes supplied: score every label densely.
        dense_scores = X.dot(self.W).toarray()
        if transform:
            dense_scores = transform(dense_scores, inplace=True)
        as_coo = smat_util.dense_to_coo(dense_scores)
        pred_csr = smat_util.sorted_csr_from_coo(
            as_coo.shape, as_coo.row, as_coo.col, as_coo.data,
            only_topk=only_topk,
        )

    if normalized:
        pred_csr = sk_normalize(pred_csr, axis=1, copy=False, norm='l1')
    return pred_csr
def generate(cls, tY, pY, topk=10):
    """Build a metrics object holding precision@k and recall@k for k = 1..topk.

    Args:
        tY: ground-truth ``smat.csr_matrix``; the stored column indices of a
            row are taken as that row's true labels.
        pY: predicted ``smat.csr_matrix`` of the same shape.  It is passed
            through ``smat_util.sorted_csr`` and the first ``topk`` stored
            entries of each row are used as the ranked predictions
            (presumably best-first -- confirm against ``sorted_csr``).
        topk: number of cut-offs to evaluate.

    Returns:
        ``cls(prec=prec, recall=recall)`` where both arrays have length
        ``topk`` and are averaged over the rows of ``tY``.

    Raises:
        AssertionError: if either input is not a ``csr_matrix`` or the shapes
            differ.
    """
    assert isinstance(tY, smat.csr_matrix), type(tY)
    assert isinstance(pY, smat.csr_matrix), type(pY)
    assert tY.shape == pY.shape, "tY.shape = {}, pY.shape = {}".format(
        tY.shape, pY.shape)

    pY = smat_util.sorted_csr(pY)
    total_matched = sp.zeros(topk, dtype=sp.uint64)
    recall = sp.zeros(topk, dtype=sp.float64)

    for i in range(tY.shape[0]):
        truth = tY.indices[tY.indptr[i]:tY.indptr[i + 1]]
        # A row without any true label would divide by zero and turn the
        # whole accumulated recall curve into NaN; clamping the denominator
        # makes such a row contribute 0 instead.
        nr_truth = max(len(truth), 1)

        predicted = pY.indices[pY.indptr[i]:pY.indptr[i + 1]][:topk]
        matched = sp.isin(predicted, truth)
        cum_matched = sp.cumsum(matched, dtype=sp.uint64)
        nr_pred = len(cum_matched)

        total_matched[:nr_pred] += cum_matched
        recall[:nr_pred] += cum_matched / nr_truth
        if nr_pred != 0:
            # Fewer than topk predictions: carry the final hit count forward
            # to the remaining cut-offs.
            total_matched[nr_pred:] += cum_matched[-1]
            recall[nr_pred:] += cum_matched[-1] / nr_truth

    prec = total_matched / tY.shape[0] / sp.arange(1, topk + 1)
    recall = recall / tY.shape[0]
    return cls(prec=prec, recall=recall)
def predict_new(
    self,
    X,
    only_topk=None,
    transform=None,
    csr_codes=None,
    cond_prob=None,
    normalized=False,
    threads=-1,
):
    """Predict label scores, using the native routine when codes are given.

    Args:
        X: instance-by-feature matrix; ``X.shape[1]`` must equal
            ``self.nr_features``.
        only_topk: forwarded to the ``smat_util`` sorting helpers.
        transform: optional callable applied to the raw scores as
            ``transform(values, inplace=True)``.
        csr_codes: optional sparse instance-by-code matrix.  When given,
            scoring is delegated to ``clib.multilabel_predict_with_codes``.
        cond_prob: optional object whose ``combiner(values, code_scores)``
            merges label scores with the returned code scores.
        normalized: when true, the result is passed through ``sk_normalize``
            with ``axis=1`` and ``norm='l1'``.
        threads: forwarded to the native prediction routine.

    Returns:
        The sorted sparse prediction matrix.
    """
    assert X.shape[1] == self.nr_features

    if csr_codes is not None:
        assert self.C is not None, "This model does not have C"
        assert X.shape[1] == self.nr_features
        assert csr_codes.shape[0] == X.shape[0]
        assert csr_codes.shape[1] == self.nr_codes

        code_matrix = PyMatrix.init_from(csr_codes, self.dtype)
        csr_labels, pred_csr = clib.multilabel_predict_with_codes(
            X, self.pW, self.pC, code_matrix, threads=threads
        )

        # Operate on the stored values of the prediction matrix directly.
        scores = pred_csr.data
        if transform:
            scores = transform(scores, inplace=True)
        if cond_prob:
            scores[:] = cond_prob.combiner(scores, csr_labels.data)

        pred_csr = smat_util.sorted_csr(pred_csr, only_topk=only_topk)
    else:
        # No codes supplied: score every label densely.
        dense_scores = X.dot(self.W).toarray()
        if transform:
            dense_scores = transform(dense_scores, inplace=True)
        as_coo = smat_util.dense_to_coo(dense_scores)
        pred_csr = smat_util.sorted_csr_from_coo(
            as_coo.shape, as_coo.row, as_coo.col, as_coo.data,
            only_topk=only_topk,
        )

    if normalized:
        pred_csr = sk_normalize(pred_csr, axis=1, copy=False, norm='l1')
    return pred_csr
def average(*args):
    """Ensemble predictions by averaging their stored values.

    The inputs are validated with ``CsrEnsembler.check_validlity``, summed,
    passed through ``smat_util.sorted_csr``, and the stored values of the
    result are divided by the number of inputs.

    Args:
        *args: sparse prediction matrices to combine.

    Returns:
        The sorted, averaged matrix.
    """
    CsrEnsembler.check_validlity(*args)
    nr_models = len(args)
    mean_csr = smat_util.sorted_csr(sum(args))
    mean_csr.data /= nr_models
    return mean_csr
def get_optimal_codes(Y, C, only_topk=None):
    """Derive per-instance code scores from label and code matrices.

    Computes ``Y.dot(C)``, passes it through ``smat_util.sorted_csr`` and then
    through ``sk_normalize`` with ``axis=1`` and ``norm='l1'``.

    Args:
        Y: instance-by-label matrix supporting ``dot``.
        C: label-by-code matrix.
        only_topk: forwarded to ``smat_util.sorted_csr``.

    Returns:
        The normalized sparse instance-by-code matrix.
    """
    instance_by_code = Y.dot(C).tocsr()
    ranked_codes = smat_util.sorted_csr(instance_by_code, only_topk=only_topk)
    return sk_normalize(ranked_codes, axis=1, copy=False, norm='l1')