def get_hashed_label_weight(label_mapping, b):
    # `train_file` is assumed to be defined at module scope (path to the train set).
    features, labels, num_samples, num_features, num_labels = data_utils.read_data(
        train_file)
    instance, labels_flatten = labels.nonzero()
    mapped_labels = label_mapping[labels_flatten]
    m_count = Counter(mapped_labels)
    m_count_tensor = torch.zeros(b)
    for k, v in m_count.items():
        m_count_tensor[k] = v
    total = m_count_tensor.sum()
    rest = total - m_count_tensor
    # empty buckets get the smallest observed count to avoid division by zero
    m_count_tensor[m_count_tensor == 0] = m_count_tensor[m_count_tensor != 0].min()
    # m_count_tensor[m_count_tensor == 0] = float("-inf")
    w = rest / m_count_tensor
    # w = w / w.min()
    return w
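# A minimal usage sketch for get_hashed_label_weight, assuming a hypothetical
# train-file path and a random label-to-bucket hash; since w is the per-bucket
# negatives/positives ratio, it matches the pos_weight convention of
# torch.nn.BCEWithLogitsLoss.
import numpy as np
import torch

num_labels, b = 159, 100
rng = np.random.default_rng(0)
label_mapping = rng.integers(0, b, size=num_labels)  # label -> hashed bucket
train_file = "data/Bibtex/Bibtex_train.txt"          # hypothetical path
w = get_hashed_label_weight(label_mapping, b)
criterion = torch.nn.BCEWithLogitsLoss(pos_weight=w)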
def load_small_data(full_data_path, tr_path, tst_path):
    features, labels, num_samples, num_features, num_labels = data_utils.read_data(
        full_data_path)
    labels = labels.toarray()
    features = features.toarray()
    # prepend a dummy row at index 0 so 1-based split indices can be used directly
    dum_feature = np.zeros((1, num_features))
    dum_label = np.zeros((1, num_labels))
    features = np.concatenate((dum_feature, features), axis=0)
    labels = np.concatenate((dum_label, labels), axis=0)
    train_indices = pd.read_csv(tr_path, sep=" ", header=None)
    tr_indices = train_indices.to_numpy()
    train_X = features[tr_indices[:, 0]]
    train_Y = labels[tr_indices[:, 0]]
    test_indices = pd.read_csv(tst_path, sep=" ", header=None)
    tst_indices = test_indices.to_numpy()
    test_X = features[tst_indices[:, 0]]
    test_Y = labels[tst_indices[:, 0]]
    return train_X, train_Y, test_X, test_Y
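# A minimal sketch of calling load_small_data, assuming hypothetical paths and
# split files that hold row indices in their first column.
train_X, train_Y, test_X, test_Y = load_small_data(
    "data/Bibtex/Bibtex_data.txt",   # hypothetical full dataset
    "data/Bibtex/train_split.txt",   # hypothetical train indices
    "data/Bibtex/test_split.txt")    # hypothetical test indices
print(train_X.shape, train_Y.shape, test_X.shape, test_Y.shape)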
def get_discard_set(filepath, type, rate):
    assert type in ['cumsum', 'rank']
    # count labels -> where to? only in train
    features, labels, num_samples, num_features, num_labels = data_utils.read_data(
        filepath)
    # get labels with few instances
    instance, labels_flatten = labels.nonzero()
    count = Counter(labels_flatten)
    count_np = np.zeros(num_labels).astype(np.int32)
    for k, v in count.items():
        count_np[k] = v
    idx = np.argsort(count_np)
    sorted_count = np.sort(count_np)
    if type == 'cumsum':
        # discard the rarest labels whose cumulative count stays below r of the total
        percentile = np.cumsum(sorted_count) / sorted_count.sum()
        discard_sets = [set(idx[np.nonzero(percentile < r)]) for r in rate]
    elif type == 'rank':
        # discard the rarest r-fraction of labels by rank
        discard_sets = [set(idx[0:int(len(idx) * r)]) for r in rate]
    return discard_sets, count_np
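# A usage sketch for get_discard_set with a hypothetical path: 'cumsum' drops
# the rarest labels accounting for less than each rate of the total label mass,
# 'rank' drops the rarest r-fraction of labels outright.
rates = [0.01, 0.05, 0.1]
discard_sets, count_np = get_discard_set(
    "data/Bibtex/Bibtex_train.txt", 'cumsum', rates)
for r, s in zip(rates, discard_sets):
    print("rate %.2f discards %d labels" % (r, len(s)))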
cuda = torch.cuda.is_available()
R = model_cfg['r']
b = model_cfg['b']
num_labels = data_cfg["num_labels"]
dest_dim = model_cfg['dest_dim']
name = data_cfg['name']
prefix = data_cfg['prefix']
record_dir = data_cfg["record_dir"]
data_dir = os.path.join("data", name)
label_path = os.path.join(record_dir, "_".join(
    [prefix, str(num_labels), str(b), str(R)]))  # Bibtex_159_100_32
test_file = os.path.join(data_dir, name + "_test.txt")

features, labels, num_samples, num_features, num_labels = data_utils.read_data(
    test_file)
instance, labels_flatten = labels.nonzero()
count = Counter(labels_flatten)
unsorted_count = np.zeros(num_labels, dtype=np.int64)
for k, v in count.items():
    unsorted_count[k] = v
sorted_idx = np.flip(np.argsort(unsorted_count))
thres = [5, 10, 20]

for r in tqdm.tqdm(range(R)):
    # use feature hashing to map back
    counts, label_mapping, inv_mapping = get_label_hash(label_path, r)
    mapped_labels = label_mapping[labels_flatten]
    m_count = Counter(mapped_labels)  # hashed label -> hashed count
    zero_count = b - len(m_count)
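    # A plausible continuation sketch (not from the original script): materialize
    # the hashed counts densely and report how many buckets fall below each
    # threshold in `thres` for this repetition.
    m_count_np = np.zeros(b, dtype=np.int64)
    for k, v in m_count.items():
        m_count_np[k] = v
    for t in thres:
        print("rep %d: %d/%d buckets with fewer than %d test instances"
              % (r, int((m_count_np < t).sum()), b, t))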
def loadData(cls, file_path):
    X, labels, _, _, _ = data_utils.read_data(file_path)
    return X, labels
# load dataset
test_file = os.path.join(data_dir, prefix + "_test.txt")
label_path = os.path.join(record_dir, "_".join(
    [prefix, str(num_labels), str(b), str(R)]))  # Bibtex_159_100_32
pred_avg_meter = AverageMeter()
logging.info("Evaluating mAP only config %s" % (a.model))
logging.info("Dataset config %s" % (a.dataset))
if a.cost:
    logging.info("Evaluating cost-sensitive method: %s" % (a.cost))

# get inverse propensity
_, labels, _, _, _ = data_utils.read_data(test_file)
inv_propen = xc_metrics.compute_inv_propesity(labels, model_cfg["ps_A"],
                                              model_cfg["ps_B"])
ap_meter = meter.APMeter()
a.__dict__['rep'] = 0
single_model_dir = get_model_dir(data_cfg, model_cfg, a)
gt_filename = os.path.join(single_model_dir, "gt.npz")
gt = scipy.sparse.load_npz(gt_filename).tocsc()

# get label mappings
l_maps = []
for r in range(R):
    counts, label_mapping, inv_mapping = get_label_hash(label_path, r)
    l_maps.append(label_mapping)
l_maps = np.stack(l_maps, axis=0)  # R x #labels
lfu = cachetools.LRUCache(R * a.bs * a.cs)
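# A sketch of how the APMeter above is typically fed (torchnet-style API,
# assumed): per-label scores plus binary targets; value() returns one average
# precision per label, so the mean gives mAP.
scores = torch.rand(4, num_labels)                      # hypothetical batch
targets = torch.randint(0, 2, (4, num_labels)).float()  # hypothetical ground truth
ap_meter.add(scores, targets)
mAP = ap_meter.value().mean()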
from xclib.data import data_utils
import xclib.evaluation.xc_metrics as xc_metrics
import numpy as np

dataset = 'eurlex'

# read the training file with features and labels
features, labels, num_samples, num_features, num_labels = data_utils.read_data(
    'data/' + dataset + '/' + 'train.txt')

# compute inverse propensity scores and dump train matrices as sparse text files
A, B = 0.55, 1.5
inv_propen = xc_metrics.compute_inv_propesity(labels, A, B)
np.savetxt('inv_prop.txt', inv_propen)
data_utils.write_sparse_file(features, "trn_X_Xf.txt")
data_utils.write_sparse_file(labels, "trn_X_Y.txt")

# same for the test split
features, labels, num_samples, num_features, num_labels = data_utils.read_data(
    'data/' + dataset + '/' + 'test.txt')
data_utils.write_sparse_file(features, "tst_X_Xf.txt")
data_utils.write_sparse_file(labels, "tst_X_Y.txt")
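# For reference, a standalone sketch of the propensity model behind
# compute_inv_propesity (Jain et al. 2016): with N training points and label
# frequencies n_l, the inverse propensity is 1 + C * (n_l + B)^(-A) where
# C = (log N - 1) * (B + 1)^A.
def inv_propensity_sketch(label_matrix, A=0.55, B=1.5):
    N = label_matrix.shape[0]
    freqs = np.ravel(label_matrix.sum(axis=0))
    C = (np.log(N) - 1) * np.power(B + 1, A)
    return 1.0 + C * np.power(freqs + B, -A)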
def loadData(cls, file_path):
    X, labels, _, _, _ = data_utils.read_data(file_path)
    X = normalize(X)
    return X, labels
def get_matrix_from_txt(path, isSparse):
    if isSparse:
        # features-only sparse file (e.g. trn_X_Xf.txt); it carries no labels
        features = data_utils.read_sparse_file(path, force_header=True)
        return features.toarray(), None
    features, labels, num_samples, num_features, num_labels = data_utils.read_data(
        path)
    return features.toarray(), labels.toarray().astype(int)
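# Usage sketch for the sparse branch, assuming a features-only file like the
# trn_X_Xf.txt written earlier; the dense branch is exercised further below.
X, _ = get_matrix_from_txt("trn_X_Xf.txt", isSparse=True)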
def load_rcv_data(path):
    features, labels, num_samples, num_features, num_labels = data_utils.read_data(
        path)
    return features, labels
def get_matrix_from_txt(path):
    features, labels, num_samples, num_features, num_labels = data_utils.read_data(
        path)
    return features.toarray(), labels.toarray().astype(int)
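# The dense matrices returned by get_matrix_from_txt plug straight into e.g.
# scikit-learn estimators; the path here is hypothetical.
X, Y = get_matrix_from_txt("data/Bibtex/Bibtex_train.txt")
print(X.shape, Y.shape, Y.dtype)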