Example #1
def get_hashed_label_weight(label_mapping, b):
    # `train_file` is expected to be defined at module scope
    features, labels, num_samples, num_features, num_labels = data_utils.read_data(
        train_file)
    instance, labels_flatten = labels.nonzero()
    mapped_labels = label_mapping[labels_flatten]
    m_count = Counter(mapped_labels)
    m_count_tensor = torch.zeros(b)
    for k, v in m_count.items():
        m_count_tensor[k] = v
    total = m_count_tensor.sum()
    rest = total - m_count_tensor
    # avoid division by zero: give empty buckets the smallest non-zero count
    m_count_tensor[m_count_tensor == 0] = m_count_tensor[m_count_tensor != 0].min()
    # m_count_tensor[m_count_tensor == 0] = float("-inf")
    w = rest / m_count_tensor
    # w = w / w.min()
    return w
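A minimal usage sketch for the function above, assuming the Bibtex dataset (159 labels hashed into b = 100 buckets), a simple modulo mapping, and that the snippet lives in the same module as the function so the module-level `train_file` is visible to it; the path, the mapping scheme, and the loss-weight usage are illustrative assumptions, not taken from the original code.

import numpy as np

train_file = "data/Bibtex/Bibtex_data.txt"  # assumed path, read by get_hashed_label_weight
num_labels, b = 159, 100
label_mapping = np.arange(num_labels) % b   # illustrative hash: label id -> bucket
weights = get_hashed_label_weight(label_mapping, b)
# weights[k] = (total - count_k) / count_k for bucket k; it could be used, for example,
# as torch.nn.BCEWithLogitsLoss(pos_weight=weights)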
Example #2
def load_small_data(full_data_path, tr_path, tst_path):
    features, labels, num_samples, num_features, num_labels = data_utils.read_data(
        full_data_path)
    labels = labels.toarray()
    features = features.toarray()
    # prepend an all-zero dummy row so the 1-based indices in the split files
    # can index rows directly
    dum_feature = np.zeros((1, num_features))
    dum_label = np.zeros((1, num_labels))
    features = np.concatenate((dum_feature, features), axis=0)
    labels = np.concatenate((dum_label, labels), axis=0)
    train_indices = pd.read_csv(tr_path, sep=" ", header=None)
    tr_indices = train_indices.to_numpy()
    train_X = features[tr_indices[:, 0]]
    train_Y = labels[tr_indices[:, 0]]
    test_indices = pd.read_csv(tst_path, sep=" ", header=None)
    tst_indices = test_indices.to_numpy()
    test_X = features[tst_indices[:, 0]]
    test_Y = labels[tst_indices[:, 0]]
    return train_X, train_Y, test_X, test_Y
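For illustration, a hypothetical call with Bibtex-style split files; all three paths and the file layout are assumptions, not taken from the original code.

train_X, train_Y, test_X, test_Y = load_small_data(
    "data/Bibtex/Bibtex_data.txt",       # full dataset in XMC text format (assumed)
    "data/Bibtex/bibtex_trSplit.txt",    # train split indices (assumed)
    "data/Bibtex/bibtex_tstSplit.txt")   # test split indices (assumed)
print(train_X.shape, train_Y.shape, test_X.shape, test_Y.shape)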
Example #3
def get_discard_set(filepath, type, rate):
    assert type in ['cumsum', 'rank']
    # count label frequencies (computed on the training split only)
    features, labels, num_samples, num_features, num_labels = data_utils.read_data(
        filepath)
    # get labels with few instances
    instance, labels_flatten = labels.nonzero()
    count = Counter(labels_flatten)
    count_np = np.zeros(num_labels, dtype=np.int32)
    for k, v in count.items():
        count_np[k] = v
    idx = np.argsort(count_np)
    sorted_count = np.sort(count_np)
    if type == 'cumsum':
        # discard labels whose cumulative share of label occurrences falls below each rate
        percentile = np.cumsum(sorted_count) / sorted_count.sum()
        discard_sets = [set(idx[np.nonzero(percentile < r)]) for r in rate]
    elif type == 'rank':
        # discard the rarest fraction r of labels by frequency rank
        discard_sets = [set(idx[0:int(len(idx) * r)]) for r in rate]
    return discard_sets, count_np
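A short usage sketch of the two discard modes; the file path and the rate values are assumptions for illustration.

discard_sets, counts = get_discard_set(
    "data/Bibtex/Bibtex_data.txt",  # assumed path
    type='cumsum', rate=[0.01, 0.05])
# discard_sets[i]: label ids whose cumulative occurrence share is below rate[i]

rank_sets, _ = get_discard_set(
    "data/Bibtex/Bibtex_data.txt", type='rank', rate=[0.1])
# rank_sets[0]: the 10% rarest labels by frequency rank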
Example #4
    # fragment of a larger evaluation routine; `model_cfg`, `data_cfg` and
    # `get_label_hash` are defined by the enclosing script
    cuda = torch.cuda.is_available()
    R = model_cfg['r']
    b = model_cfg['b']
    num_labels = data_cfg["num_labels"]
    dest_dim = model_cfg['dest_dim']
    name = data_cfg['name']
    prefix = data_cfg['prefix']
    record_dir = data_cfg["record_dir"]
    data_dir = os.path.join("data", name)
    label_path = os.path.join(record_dir, "_".join(
        [prefix, str(num_labels), str(b),
         str(R)]))  # Bibtex_159_100_32

    test_file = os.path.join(data_dir, name + "_test.txt")
    features, labels, num_samples, num_features, num_labels = data_utils.read_data(
        test_file)
    instance, labels_flatten = labels.nonzero()
    count = Counter(labels_flatten)
    unsorted_count = np.zeros(num_labels, dtype=np.int64)  # np.int is removed in modern NumPy

    for k, v in count.items():
        unsorted_count[k] = v
    sorted_idx = np.flip(np.argsort(unsorted_count))
    thres = [5, 10, 20]
    for r in tqdm.tqdm(range(R)):
        # use feature hashing to map back
        counts, label_mapping, inv_mapping = get_label_hash(label_path, r)
        mapped_labels = label_mapping[labels_flatten]
        m_count = Counter(mapped_labels)
        zero_count = b - len(m_count)
        # hashed label -> hashed count
Example #5
 def loadData(cls, file_path):
     X, labels, _, _, _ = data_utils.read_data(file_path)
     return X, labels
    # what follows is a separate fragment of the surrounding evaluation script
    # load dataset
    test_file = os.path.join(data_dir, prefix + "_test.txt")
    label_path = os.path.join(record_dir, "_".join(
        [prefix, str(num_labels), str(b),
         str(R)]))  # Bibtex_159_100_32

    pred_avg_meter = AverageMeter()
    logging.info("Evaluating mAP only config %s" % (a.model))
    logging.info("Dataset config %s" % (a.dataset))
    if a.cost:
        logging.info("Evaluating cost-sensitive method: %s" % (a.cost))

    # get inverse propensity

    _, labels, _, _, _ = data_utils.read_data(test_file)
    inv_propen = xc_metrics.compute_inv_propesity(labels, model_cfg["ps_A"],
                                                  model_cfg["ps_B"])
    ap_meter = meter.APMeter()

    a.__dict__['rep'] = 0
    single_model_dir = get_model_dir(data_cfg, model_cfg, a)
    gt_filename = os.path.join(single_model_dir, "gt.npz")
    gt = scipy.sparse.load_npz(gt_filename).tocsc()
    # get label mappings
    l_maps = []
    for r in range(R):
        counts, label_mapping, inv_mapping = get_label_hash(label_path, r)
        l_maps.append(label_mapping)
    l_maps = np.stack(l_maps, axis=0)  # R x #labels
    lfu = cachetools.LRUCache(R * a.bs * a.cs)
Example #7
from xclib.data import data_utils
import xclib.evaluation.xc_metrics as xc_metrics
import numpy as np

dataset = 'eurlex'
# Read file with features and labels
features, labels, num_samples, num_features, num_labels = data_utils.read_data(
    'data/' + dataset + '/' + 'train.txt')

# propensity-model parameters (A, B) for this dataset
A, B = 0.55, 1.5
inv_propen = xc_metrics.compute_inv_propesity(labels, A, B)
np.savetxt('inv_prop.txt', inv_propen)

data_utils.write_sparse_file(features, "trn_X_Xf.txt")
data_utils.write_sparse_file(labels, "trn_X_Y.txt")

features, labels, num_samples, num_features, num_labels = data_utils.read_data(
    'data/' + dataset + '/' + 'test.txt')
data_utils.write_sparse_file(features, "tst_X_Xf.txt")
data_utils.write_sparse_file(labels, "tst_X_Y.txt")
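The written files can be read back with data_utils.read_sparse_file as a quick round-trip check; this is only a sketch, and whether force_header is needed depends on how write_sparse_file wrote the header in your xclib version.

trn_X = data_utils.read_sparse_file("trn_X_Xf.txt", force_header=True)
trn_Y = data_utils.read_sparse_file("trn_X_Y.txt", force_header=True)
print(trn_X.shape, trn_Y.shape)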
Example #8
 def loadData(cls, file_path):
     X, labels, _, _, _ = data_utils.read_data(file_path)
     X = normalize(X)  # e.g. sklearn.preprocessing.normalize, imported elsewhere
     return X, labels
Example #9
def get_matrix_from_txt(path, isSparse):
    if isSparse:
        # sparse-format file: read it directly and return the dense matrix
        mat = data_utils.read_sparse_file(path, force_header=True)
        return mat.toarray().astype(int)
    features, labels, num_samples, num_features, num_labels = data_utils.read_data(
        path)
    return features.toarray(), labels.toarray().astype(int)
Example #10
def load_rcv_data(path):
    features, labels, num_samples, num_features, num_labels = data_utils.read_data(
        path)
    return features, labels
Example #11
def get_matrix_from_txt(path):
    features, labels, num_samples, num_features, num_labels = data_utils.read_data(
        path)
    return features.toarray(), labels.toarray().astype('int')
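A trivial usage sketch of the function above (the path is an assumption):

X, Y = get_matrix_from_txt("data/eurlex/train.txt")
print(X.shape, Y.shape, Y.dtype)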