import os

import numpy as np


def load_data(rundir, dataset, normalize_cols=False):
    """Given a rundir with an LDA output theta.npy in it, and a dataset dir
    which contains a NIPS_categories.txt, this function builds and returns
    ( (X_train, y_train, train_ids), (X_test, y_test, test_ids) ).
    """
    th = np.load(os.path.join(rundir, "theta.npy"))
    ndata, nfeatures = th.shape
    if normalize_cols:
        # rescale every topic column to unit standard deviation
        th = th / np.std(th, axis=0)

    ##### MAIN SETTINGS FOR DATA SET ######################################################################
    if not os.path.exists(dataset):
        print("Error: dataset dir does not exist...")
        return -1
    DATA_PARENT_DIR = dataset
    print(" LOADING Corpus Data from: " + DATA_PARENT_DIR)
    VOCAB_FILE = os.path.join(DATA_PARENT_DIR, "NIPS_vocab.txt")
    DOCS_FILE = os.path.join(DATA_PARENT_DIR, "NIPS_counts.mm")
    IDS_FILE = os.path.join(DATA_PARENT_DIR, "NIPS_doc_names.txt")
    # none of these are used here... but nice block of code to keep :)

    with open(os.path.join(DATA_PARENT_DIR, "NIPS_categories.txt"), "r") as catsf:
        categories = np.array([int(cid.strip()) for cid in catsf.readlines()])

    # X is theta [ [ p(t1|d1) p(t2|d1) ... p(tT|d1) ]
    #              [ p(t1|d2) p(t2|d2) ... p(tT|d2) ]
    #              ...
    #              [ p(t1|dD)     ...      p(tT|dD) ] ]
    # y are the labels [1 ... 8]
    #
    # custom function that ensures that exactly 10 samples of each label
    # end up in the test set (a sketch of this helper follows below)
    ((trainX, trainY, train_ids),
     (testX, testY, test_ids)) = random_split(th, categories, size=10)
    return ((trainX, trainY, train_ids), (testX, testY, test_ids))
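# The random_split helper used above is not defined in this file. What
# follows is a minimal sketch of its assumed behavior, based only on the
# comment in load_data: hold out exactly `size` documents per label for the
# test set and keep the rest for training. The signature, default, and
# return shape are assumptions, not the original implementation.
def random_split(X, y, size=10):
    """Split (X, y) so the test set holds exactly `size` docs per label."""
    ids = np.arange(len(y))
    test_idx = []
    for label in np.unique(y):
        members = ids[y == label]
        # pick `size` random documents of this label for the test set
        test_idx.extend(np.random.permutation(members)[:size])
    test_idx = np.array(test_idx)
    train_mask = np.ones(len(y), dtype=bool)
    train_mask[test_idx] = False
    train_idx = ids[train_mask]
    return ((X[train_idx], y[train_idx], train_idx),
            (X[test_idx], y[test_idx], test_idx))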
def calculate_success(theta, categories):
    ((TrainX, TrainY, TrainIDs),
     (TestX, TestY, TestIDs)) = random_split(theta, categories)
    tc = ThresholdClassfier(TrainX, TrainY)
    numTestDocs = TestX.shape[0]
    scores = np.zeros(10)
    for t_id in range(0, numTestDocs):
        # classify one held-out document; g is the predicted label
        (g, top) = tc.classify(TestX[t_id])
        truth = TestY[t_id]
        if g == truth:
            scores[truth] += 1
    # 8 labels x 10 test docs per label = 80 held-out documents in total
    print(sum(scores), "/ 80")
    print(scores)
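# Example driver (a minimal sketch, not part of the original module): the
# run and dataset paths below are hypothetical placeholders, the dataset dir
# is assumed to contain the NIPS_categories.txt read by load_data, and
# ThresholdClassfier is assumed to be defined elsewhere in the repo.
if __name__ == "__main__":
    ((X_train, y_train, train_ids),
     (X_test, y_test, test_ids)) = load_data("runs/lda_run1/", "data/NIPS/")

    # calculate_success draws its own random split, so it takes the raw
    # theta matrix and category labels rather than the split from load_data
    theta = np.load(os.path.join("runs/lda_run1/", "theta.npy"))
    with open(os.path.join("data/NIPS/", "NIPS_categories.txt")) as catsf:
        categories = np.array([int(cid.strip()) for cid in catsf.readlines()])
    calculate_success(theta, categories)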