Example #1
import sys
import os
import json
import numpy as np
from xclib.data import data_utils as du


def main():
    data_dir = sys.argv[1]
    trn_x = du.read_sparse_file(os.path.join(data_dir, sys.argv[2]))
    trn_y = du.read_sparse_file(os.path.join(data_dir, sys.argv[3]))
    yft_x = du.read_sparse_file(os.path.join(data_dir, sys.argv[4]))
    tmp_mdata = sys.argv[5]
    assert trn_x.shape[0] == trn_y.shape[0], "Number of instances must be the same in features and labels"
    num_labels = trn_y.shape[1]
    valid_trn_x = np.where(trn_x.getnnz(axis=1) > 0)[0]
    valid_trn_y = np.where(trn_y.getnnz(axis=1) > 0)[0]
    valid_idx = np.intersect1d(valid_trn_x, valid_trn_y)
    trn_x = trn_x[valid_idx]
    trn_y = trn_y[valid_idx]
    features = np.where(trn_x.getnnz(axis=0) > 0)[0]
    labels = np.where(trn_y.getnnz(axis=0) > 0)[0]
    v_lbs_wrds = np.where(yft_x[labels].getnnz(axis=0) > 0)[0]
    union_fts = np.union1d(v_lbs_wrds, features)
    path = os.path.join(tmp_mdata, 'features_split.txt')
    np.savetxt(path, union_fts, fmt='%d')
    path = os.path.join(tmp_mdata, 'labels_split.txt')
    np.savetxt(path, labels, fmt='%d')
    path = os.path.join(tmp_mdata, 'v_lbs_fts_split.txt')
    np.savetxt(path, union_fts, fmt='%d')
    params = "{},{},{},{}".format(union_fts.size, num_labels,
                                  labels.size, union_fts.size)
    print(params)
    stats_obj = {'header': 'num_features,num_labels,valid_num_labels,valid_num_features'}
    stats_obj['all'] = params
    json.dump(stats_obj, open(os.path.join(
        tmp_mdata, "split_stats.json"), 'w'), indent=4)
Example #2
from xclib.data import data_utils
import xclib.evaluation.xc_metrics as xc_metrics


def main(targets_file, train_file, predictions_file, A, B):
    """
        Args:
            targets_file: test labels
            train_file: train labels (to compute prop)
            prediction_file: predicted labels
            A: int: to compute propensity
            B: int: to compute propensity
    """
    true_labels = data_utils.read_sparse_file(targets_file)
    predicted_labels = data_utils.read_sparse_file(predictions_file)
    inv_psp = compute_inv_propensity(train_file, A, B)
    acc = xc_metrics.Metrics(true_labels=true_labels, inv_psp=inv_psp)
    args = acc.eval(predicted_labels, 5)
    print(xc_metrics.format(*args))
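A minimal sketch of invoking this entry point, assuming the common A=0.55, B=1.5 propensity parameters; the file names are placeholders:

# Hypothetical call; file names are placeholders.
main('tst_X_Y.txt', 'trn_X_Y.txt', 'predictions.txt', A=0.55, B=1.5)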
Example #3
    def read_predictions(self, fname):
        if self.ftype == 'mat':
            return loadmat(fname)['predicted_labels']
        elif self.ftype == 'txt':
            return data_utils.read_sparse_file(fname)
        elif self.ftype == 'npz':
            return load_npz(fname)
        else:
            raise NotImplementedError("Unknown file type")
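This helper presumes loaders from SciPy and xclib are in scope; a plausible import block (assumed, not shown in the source):

from scipy.io import loadmat          # .mat files
from scipy.sparse import load_npz     # .npz sparse matrices
from xclib.data import data_utils     # .txt sparse files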
Example #4
def main(targets_label_file, train_label_file, predictions_file, A, B, docs, lbls):
    true_labels = _remove_overlap(
        data_utils.read_sparse_file(
            targets_label_file, force_header=True).tolil(),
        docs, lbls)
    trn_labels = data_utils.read_sparse_file(
        train_label_file, force_header=True)
    inv_propen = xc_metrics.compute_inv_propesity(trn_labels, A=A, B=B)
    acc = xc_metrics.Metrics(
        true_labels, inv_psp=inv_propen, remove_invalid=False)
    predicted_labels = _remove_overlap(
        load_npz(predictions_file+'.npz').tolil(),
        docs, lbls)
    rec = xc_metrics.recall(predicted_labels, true_labels, k=20)[-1]*100
    print("R@20=%0.2f" % (rec))
    args = acc.eval(predicted_labels, 5)
    print(xc_metrics.format(*args))
Example #5
def prepare_data(f_train_x, f_train_y, f_test_x, f_test_y, f_val_x, f_val_y):
    train_x = data_utils.read_sparse_file(f_train_x).todense()
    train_y = pd.read_csv(f_train_y, header=None)
    train_y = np.array(train_y).reshape(len(train_y), )

    test_x = data_utils.read_sparse_file(f_test_x).todense()
    test_y = pd.read_csv(f_test_y, header=None)
    test_y = np.array(test_y).reshape(len(test_y), )

    val_x = data_utils.read_sparse_file(f_val_x).todense()
    val_y = pd.read_csv(f_val_y, header=None)
    val_y = np.array(val_y).reshape(len(val_y), )

    print('Shape of Training data:', train_x.shape, train_y.shape)
    print('Shape of Testing data:', test_x.shape, test_y.shape)
    print('Shape of Validation data:', val_x.shape, val_y.shape)

    return train_x, train_y, test_x, test_y, val_x, val_y
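A hypothetical driver for prepare_data (the six paths are placeholders):

train_x, train_y, test_x, test_y, val_x, val_y = prepare_data(
    'trn_X.txt', 'trn_y.csv', 'tst_X.txt', 'tst_y.csv', 'val_X.txt', 'val_y.csv')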
Example #6
def main(tst_label_fname, trn_label_fname, filter_fname, pred_fname, A, B,
         betas, top_k, save):
    true_labels = data_utils.read_sparse_file(tst_label_fname)
    trn_labels = data_utils.read_sparse_file(trn_label_fname)
    inv_propen = xc_metrics.compute_inv_propesity(trn_labels, A, B)
    mapping = get_filter_map(filter_fname)
    acc = xc_metrics.Metrics(true_labels, inv_psp=inv_propen)
    root = os.path.dirname(pred_fname)
    ans = ""
    if isinstance(betas, list) and betas[0] != -1:
        knn = filter_predictions(load_npz(pred_fname + '_knn.npz'), mapping)
        clf = filter_predictions(load_npz(pred_fname + '_clf.npz'), mapping)
        args = acc.eval(clf, 5)
        ans = f"classifier\n{xc_metrics.format(*args)}"
        args = acc.eval(knn, 5)
        ans = ans + f"\nshortlist\n{xc_metrics.format(*args)}"
        clf = retain_topk(clf, k=top_k)
        knn = retain_topk(knn, k=top_k)
        clf = normalize(sigmoid(clf), norm='max')
        knn = normalize(sigmoid(knn), norm='max')
        for beta in betas:
            predicted_labels = beta * clf + (1 - beta) * knn
            args = acc.eval(predicted_labels, 5)
            ans = ans + f"\nbeta: {beta:.2f}\n{xc_metrics.format(*args)}"
            if save:
                fname = os.path.join(root, f"score_{beta:.2f}.npz")
                save_npz(fname,
                         retain_topk(predicted_labels, k=top_k),
                         compressed=False)
    else:
        predicted_labels = filter_predictions(
            sigmoid(load_npz(pred_fname + '.npz')), mapping)
        args = acc.eval(predicted_labels, 5)
        ans = xc_metrics.format(*args)
        if save:
            print("Saving predictions..")
            fname = os.path.join(root, "score.npz")
            save_npz(fname,
                     retain_topk(predicted_labels, k=top_k),
                     compressed=False)
    line = "-" * 30
    print(f"\n{line}\n{ans}\n{line}")
    return ans
Example #7
from xclib.data import data_utils
import xclib.evaluation.xc_metrics as xc_metrics


def compute_inv_propensity(train_file, A, B):
    """
        Compute Inverse propensity values
        Values for A/B:
            Wikipedia-500K: 0.5/0.4
            Amazon-670K, Amazon-3M: 0.6/2.6
            Others: 0.55/1.5
    """
    train_labels = data_utils.read_sparse_file(train_file)
    inv_propen = xc_metrics.compute_inv_propesity(train_labels, A, B)
    return inv_propen
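For reference, these weights follow the propensity model commonly used in extreme classification (Jain et al., 2016). A sketch of the computation behind xc_metrics.compute_inv_propesity under that standard formulation (illustrative, not the library source):

import numpy as np

def inv_propensity_sketch(labels, A, B):
    # labels: sparse (num_instances x num_labels) ground-truth matrix
    N = labels.shape[0]
    freqs = np.ravel(labels.sum(axis=0))        # per-label frequency N_l
    C = (np.log(N) - 1) * np.power(B + 1, A)
    return 1.0 + C * np.power(freqs + B, -A)    # inverse propensity per label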
Example #8
def main(tst_label_fname, trn_label_fname, pred_fname,
         A, B, save, *args, **kwargs):
    true_labels = data_utils.read_sparse_file(tst_label_fname)
    trn_labels = data_utils.read_sparse_file(trn_label_fname)
    inv_propen = xc_metrics.compute_inv_propesity(trn_labels, A, B)
    acc = xc_metrics.Metrics(true_labels, inv_psp=inv_propen)
    root = os.path.dirname(pred_fname[-1])
    predicted_labels = read_files(pred_fname)
    ens_predicted_labels = merge(predicted_labels)
    ans = ""
    for idx, pred in enumerate(predicted_labels):
        args = acc.eval(pred, 5)
        ans = ans + f"learner: {idx}\n{xc_metrics.format(*args)}\n"
    args = acc.eval(ens_predicted_labels, 5)
    ans = ans + f"Ensemble\n{xc_metrics.format(*args)}"
    if save:
        print("Saving predictions..")
        fname = os.path.join(root, "score.npz")
        save_npz(fname, ens_predicted_labels, compressed=False)
    line = "-"*30
    print(f"\n{line}\n{ans}\n{line}")
    return ans
Example #9
    def load(self, data_dir, fname, X):
        if X is not None:
            return X
        else:
            assert fname is not None, "Filename cannot be None."
            fname = os.path.join(data_dir, fname)
            if fname.lower().endswith('.pkl'):
                with open(fname, 'rb') as fp:
                    return pickle.load(fp)['X']
            elif fname.lower().endswith('.txt'):
                return data_utils.read_sparse_file(
                    fname, dtype=np.float32)
            else:
                raise NotImplementedError("Unknown file extension")
Example #10
    def load(self, data_dir, fname, Y):
        if Y is not None:
            return Y
        elif fname is None:
            return None
        else:
            fname = os.path.join(data_dir, fname)
            if fname.lower().endswith('.pkl'):
                with open(fname, 'rb') as fp:
                    return pickle.load(fp)['Y']
            elif fname.lower().endswith('.txt'):
                return data_utils.read_sparse_file(fname,
                                                   dtype=np.float32,
                                                   safe_read=False)
            else:
                raise NotImplementedError("Unknown file extension")
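A minimal usage sketch for these two loaders (the object and file names are hypothetical):

# Hypothetical usage; 'features.txt' / 'labels.txt' are placeholder names.
X = dataset.load('data/', 'features.txt', None)  # -> sparse float32 features
Y = dataset.load('data/', 'labels.txt', None)    # -> sparse float32 labels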
Example #11
def run(feat_fname, lbl_fname, feature_type, method, threshold, seed, tmp_dir):
    np.random.seed(seed)
    if feature_type == 'dense':
        features = data_utils.read_gen_dense(feat_fname)
    elif feature_type == 'sparse':
        features = data_utils.read_gen_sparse(feat_fname)
    else:
        raise NotImplementedError()
    labels = data_utils.read_sparse_file(lbl_fname)
    assert features.shape[0] == labels.shape[0], \
        "Number of instances must be the same in features and labels"
    num_features = features.shape[1]
    stats_obj = {}
    stats_obj['threshold'] = threshold
    stats_obj['method'] = method

    sd = SurrogateMapping(method=method,
                          threshold=threshold,
                          feature_type=feature_type)
    sd.fit(features, labels)
    stats_obj['surrogate'] = "{},{},{}".format(num_features,
                                               sd.num_surrogate_labels,
                                               sd.num_surrogate_labels)
    stats_obj['extreme'] = "{},{},{}".format(num_features, sd.num_labels,
                                             len(sd.valid_labels))

    json.dump(stats_obj,
              open(os.path.join(tmp_dir, "data_stats.json"), 'w'),
              indent=4)

    np.savetxt(os.path.join(tmp_dir, "valid_labels.txt"),
               sd.valid_labels,
               fmt='%d')
    np.savetxt(os.path.join(tmp_dir, "surrogate_mapping.txt"),
               sd.mapping,
               fmt='%d')
Example #12
import sys
import time
import math
from xclib.data import data_utils
import nmslib

lbl_ft_file = sys.argv[1]
model_file = sys.argv[2]
M = int(sys.argv[3])
efC = int(sys.argv[4])
num_threads = int(sys.argv[5])
num_ft = int(sys.argv[6])
metric_space = sys.argv[7]

start = time.time()
data = data_utils.read_sparse_file(lbl_ft_file)
end = time.time()
start = time.time()
index = nmslib.init(method='hnsw',
                    space='cosinesimil_sparse',
                    data_type=nmslib.DataType.SPARSE_VECTOR)
index.addDataPointBatch(data)
index.createIndex({
    'M': M,
    'indexThreadQty': num_threads,
    'efConstruction': efC
})
end = time.time()
print('Training time of ANNS datastructure = %f' % (end - start))
nmslib.saveIndex(index, model_file)
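Querying the saved index later might look like the sketch below (it mirrors Example #20; the efSearch and k values are illustrative):

# Illustrative query pass against the saved index.
index = nmslib.init(method='hnsw', space='cosinesimil_sparse',
                    data_type=nmslib.DataType.SPARSE_VECTOR)
index.addDataPointBatch(data)
nmslib.loadIndex(index, model_file)
index.setQueryTimeParams({'efSearch': 300})
nbrs = index.knnQueryBatch(data, k=10, num_threads=num_threads)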
Example #13
from xclib.data import data_utils
from scipy import stats
import scipy
import numpy as np
import math
import time
# Read sparse feature file (training features, despite the variable name)
labels = data_utils.read_sparse_file('train_x.txt')

x1 = labels.todense()

x = np.asarray(x1)

# Read integer class labels, one per whitespace-separated token
with open("train_y.txt", "r") as f:
    contents = f.read()
y1 = list(map(int, contents.split()))

y = np.array(y1)
y5 = np.array(y1)

class Node:
    def __init__(self):
        self.index = -1
        self.label = -1
        self.leaf = False
Example #14
    ]
    print("len(trn_point_titles), len(tst_point_titles), len(label_titles) = ",
          len(trn_point_titles), len(tst_point_titles), len(label_titles))

    trn_point_features = np.load(
        "{}/{}CondensedData/trn_point_embs.npy".format(DATASET, EMB_TYPE))
    label_features = np.load("{}/{}CondensedData/label_embs.npy".format(
        DATASET, EMB_TYPE))
    tst_point_features = np.load(
        "{}/{}CondensedData/tst_point_embs.npy".format(DATASET, EMB_TYPE))
    print(
        "trn_point_features.shape, tst_point_features.shape, label_features.shape",
        trn_point_features.shape, tst_point_features.shape,
        label_features.shape)

    trn_X_Y = data_utils.read_sparse_file("{}/trn_X_Y.txt".format(DATASET),
                                          force_header=True)
    tst_X_Y = data_utils.read_sparse_file("{}/tst_X_Y.txt".format(DATASET),
                                          force_header=True)

    tst_valid_inds, trn_X_Y, tst_X_Y_trn, tst_X_Y_val, node_features, valid_tst_point_features, label_remapping, adjecency_lists, NUM_TRN_POINTS = prepare_data(
        trn_X_Y, tst_X_Y, trn_point_features, tst_point_features,
        label_features, trn_point_titles, tst_point_titles, label_titles, args)

    hard_negs = [[] for i in range(node_features.shape[0])]

    print("trn_X_Y.shape, tst_X_Y_trn.shape, tst_X_Y_val.shape", trn_X_Y.shape,
          tst_X_Y_trn.shape, tst_X_Y_val.shape)

    temp = [
        line.strip().split() for line in open(
            "{}/filter_labels_test.txt".format(DATASET), "r").readlines()
Example #15
def get_accuracy(clf, a, b):
    i = 0
    cnt = 0
    for x in clf.predict(a):
        if (x == b[i]):
            cnt += 1
        i += 1
    return (cnt / i)
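For comparison, scikit-learn computes the same quantity in one call (assuming clf, a, b as above):

from sklearn.metrics import accuracy_score
# Equivalent to get_accuracy(clf, a, b)
acc = accuracy_score(b, clf.predict(a))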


if __name__ == "__main__":
    import sys
    from sklearn.datasets import load_iris
    #___________________________________________________________________________________
    train_x = data_utils.read_sparse_file(train_x_path)
    train_y = load_y(train_y_path, train_x.shape[0])

    test_x = data_utils.read_sparse_file(test_x_path)
    test_y = load_y(test_y_path, test_x.shape[0])

    val_x = data_utils.read_sparse_file(val_x_path)
    val_y = load_y(val_y_path, val_x.shape[0])

    # Densify the features; cap at the first 100 rows for quick experiments
    train_data = train_x.toarray().astype(int)
    train_data = train_data[:100, :]
    train_y = train_y[:100]
Example #16
def read_labels(f_name):
    f = pd.read_csv(f_name, header=None, encoding='ISO-8859-1')
    f = f.to_numpy()
    return f


# In[3]:

Y_test = read_labels(
    '/home/shreya/Sem6/COL774/A3/virus/ass3_parta_data/test_y.txt')
Y_train = read_labels(
    '/home/shreya/Sem6/COL774/A3/virus/ass3_parta_data/train_y.txt')

x_test = data_utils.read_sparse_file(
    '/home/shreya/Sem6/COL774/A3/virus/ass3_parta_data/test_x.txt',
    force_header=True)
x_train = data_utils.read_sparse_file(
    '/home/shreya/Sem6/COL774/A3/virus/ass3_parta_data/train_x.txt',
    force_header=True)

# In[4]:

# X_train = np.vstack((x_train[0].toarray(),  x_train[1].toarray()))
# for i in range(2,x_train.shape[0]):
#     l = x_train[i].toarray()
#     X_train = np.vstack((X_train,  x_train[i].toarray()))

X_train = x_train.toarray()

# In[5]:
Example #17
def get_matrix_from_txt(path, isSparse):
    if isSparse:
        # Sparse input: read the matrix directly from the given path
        return data_utils.read_sparse_file(path, force_header=True).toarray()
    features, labels, num_samples, num_features, num_labels = \
        data_utils.read_data(path)
    return features.toarray(), labels.toarray().astype(int)
Example #18
from matplotlib import pyplot as plt
from collections import deque
import copy
import sys
import numpy as np
import pandas as pd
from xclib.data import data_utils
from treeclass import Decision_tree_classifier



train_x_path = sys.argv[1]
train_y_path = sys.argv[2]
test_x_path = sys.argv[3]
test_y_path = sys.argv[4]
val_x_path = sys.argv[5]
val_y_path = sys.argv[6]

train_x = data_utils.read_sparse_file(train_x_path, force_header=True).toarray()
test_x = data_utils.read_sparse_file(test_x_path, force_header=True).toarray()
val_x = data_utils.read_sparse_file(val_x_path, force_header=True).toarray()
train_y = np.array(pd.read_csv(train_y_path, header=None))
test_y = np.array(pd.read_csv(test_y_path, header=None))
val_y = np.array(pd.read_csv(val_y_path, header=None))

def plot_node_acc(test_score, train_acc, val_score, x, var_param_name):
    ax = plt.figure(figsize=(12, 7))
    plt.plot(x, train_acc, color='green')
    plt.plot(x, test_score)
    plt.plot(x, val_score, color='r')
    plt.legend(['Train Accuracy', 'Test Accuracy', 'Validation Accuracy'])
    plt.xlabel(var_param_name)
    plt.ylabel('Accuracy')
    plt.title('Accuracy vs ' + var_param_name)
Example #19
def readFileSparse(path):
    return data_utils.read_sparse_file(path, header=True)
Example #20
import sys
import time
import math
import nmslib
from xclib.data import data_utils

# Assumption: argv[1] carries the test feature file used below as tst_ft_file
tst_ft_file = sys.argv[1]
model_file = sys.argv[2]
num_ft = int(sys.argv[3])
num_lbls = int(sys.argv[4])
efS = int(sys.argv[5])
num_nbrs = int(sys.argv[6])
write_dist = int(sys.argv[7])
out_dir = sys.argv[8]
num_thread = int(sys.argv[9])
num_out_threads = int(sys.argv[10])
metric_space = sys.argv[11]
lbl_ft_file = sys.argv[12]

index = nmslib.init(method='hnsw',
                    space='cosinesimil_sparse',
                    data_type=nmslib.DataType.SPARSE_VECTOR)
data = data_utils.read_sparse_file(lbl_ft_file)
index.addDataPointBatch(data)
nmslib.loadIndex(index, model_file)

index.setQueryTimeParams({'efSearch': efS, 'algoType': 'old'})

start = time.time()
query = data_utils.read_sparse_file(tst_ft_file)
end = time.time()
start = time.time()
nbrs = index.knnQueryBatch(query, k=num_nbrs, num_threads=num_thread)
end = time.time()
print('Time taken to find approx nearest neighbors = %f' % (end - start))

batch_size = int(math.ceil(float(len(nbrs)) / float(num_out_threads)))
for i in range(num_out_threads):
Example #21
# In[1]:

from xclib.data import data_utils
import numpy as np
import pandas as pd
from math import log2
import time
import matplotlib.pyplot as plt
import sys

# # Reading the data-set

# In[2]:

# Read sparse file
train_x = data_utils.read_sparse_file(sys.argv[1], force_header=True)
train_x = np.array(train_x.toarray(), dtype=int)
train_y = pd.read_csv(sys.argv[2], sep="\n", header=None).to_numpy()

test_x = data_utils.read_sparse_file(sys.argv[3], force_header=True)
test_x = np.array(test_x.toarray(), dtype=int)
test_y = pd.read_csv(sys.argv[4], sep="\n", header=None).to_numpy()

val_x = data_utils.read_sparse_file(sys.argv[5], force_header=True)
val_x = np.array(val_x.toarray(), dtype=int)
val_y = pd.read_csv(sys.argv[6], sep="\n", header=None).to_numpy()

# # Calculating Entropy

# In[3]:
Example #22
                    metavar='INIT_RATIO',
                    type=float,
                    help='set initial ratio of labels for pretraining')

args = parser.parse_args()


def csr2list(M):
    row, col, _ = find(M)
    res = [[] for _ in range(M.shape[0])]
    for r, c in zip(row, col):
        res[r].append(c)
    return res


Ytr = data_utils.read_sparse_file(args.trnYfile, force_header=True)
Yte = data_utils.read_sparse_file(args.tstYfile, force_header=True)
#prob = data_utils.read_sparse_file(args.model_dir + "/overall_score_mat_init_ratio_50_batch_size_" + str(args.batch_size), force_header=True)
prob = data_utils.read_sparse_file(args.score, force_header=True)

# dense label matrix
ground_truth = Yte.toarray().astype(np.int32)

mlb = MultiLabelBinarizer(range(Yte.shape[1]), sparse_output=True)
targets = mlb.fit_transform(csr2list(Yte))
train_labels = csr2list(Ytr)
if args.dataset.startswith('WikiPedia'):
    a, b = 0.55, 0.1
elif args.dataset.startswith('Amazon-'):
    a, b = 0.6, 2.6
else:
Example #23
import sys
import pickle
import pandas as pd
from xclib.data import data_utils
from tqdm import tqdm
#Loading the data first

if len(sys.argv) != 7:
    print("Please pass the required 6 arguments.")
    sys.exit(1)

trnxPath = sys.argv[1]
trnyPath = sys.argv[2]
tstxPath = sys.argv[3]
tstyPath = sys.argv[4]
valxPath = sys.argv[5]
valyPath = sys.argv[6]

trainX = data_utils.read_sparse_file(trnxPath).toarray()
trainY = pd.read_csv(trnyPath, header=None).to_numpy()
testX = data_utils.read_sparse_file(tstxPath).toarray()
testY = pd.read_csv(tstyPath, header=None).to_numpy()
validX = data_utils.read_sparse_file(valxPath).toarray()
validY = pd.read_csv(valyPath, header=None).to_numpy()


# In[2]:


print(trainX.shape)
print(testX.shape)
print(validX.shape)