def loadMatrix(matrixFileName):
    """
    Load the sparse matrix in the libsvm format from the given file.
    Returns the csr matrix and an index to row ids, i.e. rowids is a list
    of the row ids that precede each line.
    """
    return load_svmlight_file(matrixFileName)
def get_sampled(datafile):
    n_sample = 100
    t_label = '3'
    output_num = 0
    sampled_file = os.path.join(tmp_dir, 'tmp.sample')
    fout = open(sampled_file, 'w')
    for idx, line in enumerate(open(datafile)):
        if n_sample == 0:
            break
        tokens = line.split(' ')
        label = tokens[0]
        if t_label == label:
            print output_num, idx
            fout.write(line)
            n_sample -= 1
            output_num += 1
    fout.close()
    X, y = load_svmlight_file(sampled_file)
    return X
def test_load_svmlight_file():
    X, y = load_svmlight_file(datafile)

    # test X's shape
    assert_equal(X.indptr.shape[0], 4)
    assert_equal(X.shape[0], 3)
    assert_equal(X.shape[1], 20)
    assert_equal(y.shape[0], 3)

    # test X's non-zero values
    for i, j, val in ((0, 1, 2.5), (0, 9, -5.2), (0, 14, 1.5),
                      (1, 4, 1.0), (1, 11, -3),
                      (2, 19, 27)):
        assert_equal(X[i, j], val)

    # test X's zero values
    assert_equal(X[0, 2], 0)
    assert_equal(X[0, 4], 0)
    assert_equal(X[1, 7], 0)
    assert_equal(X[1, 15], 0)
    assert_equal(X[2, 17], 0)

    # test can change X's values
    X[0, 1] *= 2
    assert_equal(X[0, 1], 5)

    # test y
    assert_array_equal(y, [1, 2, 3])
def mergeSparse(f1, f2, f3):
    X1, y1 = io.load_svmlight_file(f1)
    X2, y2 = io.load_svmlight_file(f2)

    if (y1.shape == y2.shape):
        X = sp.hstack([X1, X2])
    else:
        sys.stderr.write('Error: Different number of examples in files: '
                         + str(y1.shape) + ' != ' + str(y2.shape) + '\n')
        return

    if (y1 == y2).sum() != y1.shape[0]:
        sys.stderr.write('Warning: Label mismatch. Are you merging features '
                         'of the same subset?\n'
                         'I will use the labels of the first argument\n')

    y = y1
    io.dump_svmlight_file(X, y, f3)
    return
def test_dump():
    try:
        Xs, y = load_svmlight_file(datafile)
        tmpfile = "tmp_dump.txt"
        dump_svmlight_file(Xs, y, tmpfile, zero_based=False)
        X2, y2 = sk_load_svmlight_file(tmpfile)
        assert_array_equal(Xs.toarray(), X2.toarray())
        assert_array_equal(y, y2)
    finally:
        os.remove(tmpfile)
def changeSparseLabels(f1, f2, labelFunction=AvsI):
    Xold, yold = io.load_svmlight_file(f1)
    vectorizedFunction = np.vectorize(labelFunction, otypes=[np.int32])
    ynew = vectorizedFunction(yold)

    # keep only the examples whose new label is nonzero
    X = Xold[ynew != 0, :]
    y = ynew[ynew != 0]

    io.dump_svmlight_file(X, y, f2)
    return
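# Note: AvsI is not defined in this snippet; it is presumably a small
# label-mapping helper defined alongside changeSparseLabels. A hypothetical
# example of such a function (illustrative only, not from the original code):
# it maps one class to +1, another to -1, and everything else to 0, which
# changeSparseLabels then drops via the ynew != 0 filter.
def exampleLabelFunction(label):
    if label == 1:
        return 1
    elif label == 9:
        return -1
    return 0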
def load_data(self, rel_path):
    '''
    Loads data from an SVMLight file using the svmlight_loader library:
    https://github.com/mblondel/svmlight-loader

    Returns a list of the dataset and the labels.
    '''
    abs_path = os.path.abspath(rel_path)
    (x_train, labels) = svml.load_svmlight_file(abs_path)
    return [x_train, labels]
def test_dump():
    Xs, y = load_svmlight_file(datafile)
    Xd = Xs.toarray()
    for X in (Xs, Xd):
        f = StringIO()
        dump_svmlight_file(X, y, f, zero_based=False)
        f.seek(0)
        X2, y2 = sk_load_svmlight_file(f)
        assert_array_equal(Xd, X2.toarray())
        assert_array_equal(y, y2)
def test_load_svmlight_file_n_features():
    X, y = load_svmlight_file(datafile, n_features=14)

    # test X's shape
    assert_equal(X.indptr.shape[0], 4)
    assert_equal(X.shape[0], 3)
    assert_equal(X.shape[1], 14)

    # test X's non-zero values
    for i, j, val in ((0, 1, 2.5), (0, 9, -5.2),
                      (1, 4, 1.0), (1, 11, -3)):
        assert_equal(X[i, j], val)
def compute_LMI(matrixFileName):
    """
    First read the co-occurrence matrix from matrixFileName,
    then compute the PPMI values for the matrix.
    """
    mat, rowids = load_svmlight_file(matrixFileName)
    (nrows, ncols) = mat.shape

    # column marginals and the grand total
    colTotals = np.zeros(ncols, dtype=DTYPE)
    for j in range(0, ncols):
        colTotals[j] = np.sum(mat[:, j].data)
    N = np.sum(colTotals)

    # replace each non-zero co-occurrence count with its PPMI value
    for i in range(0, nrows):
        row = mat[i, :]
        rowTotal = np.sum(row.data)
        for j in row.indices:
            mat[i, j] = max(0, np.log((mat[i, j] * N) / (rowTotal * colTotals[j])))
    return mat
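# A minimal dense-matrix sketch of the same PPMI update used in compute_LMI,
# on a toy 2x2 co-occurrence matrix (names and values here are illustrative,
# not from the original code): each count c_ij is replaced by
# max(0, log(c_ij * N / (rowTotal_i * colTotal_j))).
import numpy as np

counts = np.array([[4.0, 0.0],
                   [2.0, 2.0]])
rowTotals = counts.sum(axis=1)   # [4., 4.]
colTotals = counts.sum(axis=0)   # [6., 2.]
N = counts.sum()                 # 8.

ppmi = np.zeros_like(counts)
for i in range(counts.shape[0]):
    for j in range(counts.shape[1]):
        if counts[i, j] > 0:
            ppmi[i, j] = max(0.0, np.log(counts[i, j] * N / (rowTotals[i] * colTotals[j])))

print ppmi   # [[ 0.288  0.   ]
             #  [ 0.     0.693]]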
def process():
    """
    Demonstrates the LogisticRegression class.
    """
    dataset = "rcv1"
    rate = 1000
    epochs = 500
    train_X, train_y = load_svmlight_file("../data/%s/%s.train" % (dataset, dataset))
    N, D = train_X.shape

    # map labels from {-1, +1} to {0, 1}
    train_y = 0.5 * (train_y + numpy.ones(N, dtype=int))

    shared_X = numpy.asarray(train_X.toarray(), dtype=theano.config.floatX)
    shared_y = numpy.asarray(train_y, dtype=theano.config.floatX)

    x = T.matrix('x')
    y = T.vector('y')
    LR = LogisticRegression(x, N, D)
    cost = LR.negative_log_likelihood(y)
    g_w = T.grad(cost=cost, wrt=LR.w)
    g_b = T.grad(cost=cost, wrt=LR.b)
    updates = [(LR.w, LR.w - rate * g_w),
               (LR.b, LR.b - rate * g_b)]

    train_model = theano.function(inputs=[x, y], outputs=cost,
                                  updates=updates, allow_input_downcast=True)
    err, wval, bval = LR.errors(y)
    test_model = theano.function(inputs=[x, y], outputs=[err, wval, bval],
                                 allow_input_downcast=True)

    print "Train instances =", N
    print "Dimensionality =", D

    for t in range(epochs):
        likelihood = train_model(shared_X, shared_y)
        err_val, w_val, b_val = test_model(shared_X, shared_y)
        #norm = numpy.dot(wval, wval)
        print "Epoch %d: Likelihood = %f Errors = %f b = %s norm = %s" % (
            t, likelihood, err_val, str(b_val), str(w_val))
def test_load_invalid_file():
    load_svmlight_file(invalidfile)
def main():
    #X, y = load_svmlight_file('./heart_scale')
    #X, y = load_svmlight_file('./large.dat')
    X, y = load_svmlight_file('./kddb')
    y[y == 0] = -1
    gradient(X, y)
def test_not_a_filename():
    load_svmlight_file(1)
def test_invalid_filename():
    load_svmlight_file("trou pic nic douille")
#!/usr/bin/env python2
import numpy as np
import pylab as pl
import svmlight_loader as io
import sys
from sklearn import datasets, svm
from sklearn.feature_selection import SelectPercentile, chi2

###############################################################################
# import some data to play with
X, y = io.load_svmlight_file('mergedCPK-TP/output_bound10_ps4_f427_cyclic.AvsI')
# X, y = io.load_svmlight_file(sys.argv[1])
###############################################################################

pl.figure(1)
pl.clf()

X_indices = np.arange(X.shape[-1])

###############################################################################
# Univariate feature selection with the chi-squared test for feature scoring.
# We keep the 10% most significant features.
selector = SelectPercentile(chi2, percentile=10)
selector.fit(X, y)
scores = -np.log10(selector.pvalues_)
scores /= scores.max()
pl.bar(X_indices - .45, scores, width=.2,
       label=r'Univariate score ($-Log(p_{value})$)')
def load_from_text(filepaths, dtype=np.float32, max_score=None,
                   min_feature=None, max_feature=None,
                   has_sorted_relevances=False, purge=False):
    '''
    Load queries in the svmlight format from the specified file(s).

    SVMlight format example (one line):
        5[\s]qid:8[\s]103:1.0[\s]...[\s]981:1.0 982:1.0 # comment[\n]

    Parameters:
    -----------
    filepaths: string or list of strings
        The location of the dataset file(s).

    dtype: data-type, optional (default is np.float32)
        The desired data-type for the document feature vectors. Here,
        the default value (np.float32) is chosen for mere optimization
        purposes.

    max_score: int, optional (default is None)
        The maximum relevance score value. If None, the value is derived
        from the relevance scores in the file.

    min_feature: int or None, optional (default is None)
        The minimum feature identifier, which is present in the dataset.
        If None, this value is read from the data. This parameter is
        important because of internal feature remapping: in case of
        loading different parts of a dataset (folds), some features may
        be present in one part and may not be present in another (because
        all its values are 0) - this would create inconsistent feature
        mappings between the parts.

    max_feature: int or None, optional (default is None)
        The maximum feature identifier, which is present in the dataset.
        If None, this value is read from the data. This parameter is
        important because of internal feature remapping, see `min_feature`
        for more.

    has_sorted_relevances: bool, optional (default is False)
        If True, it indicates that the relevance scores of the queries in
        the file are sorted in decreasing order.

    purge: bool, optional (default is False)
        If True, all queries which have documents with the same relevance
        labels are removed. If False, no query is removed.
    '''
    # Arrays used to build CSR matrix of query-document vectors.
    data, indices, indptr = [], [], [0]

    # Relevance score, query ID, query hash, and document hash.
    relevances = []

    query_ids = []
    query_indptr = [0]
    prev_qid = None

    # If only a single filepath is given, not a list.
    if isinstance(filepaths, str):
        filepaths = [filepaths]

    n_purged_queries = 0
    n_purged_documents = 0

    def purge_query(qid, data, indices, indptr):
        '''Remove the last query added to the set according to `purge`.'''
        raise NotImplementedError
        if not purge or qid is None:
            return 0
        r = relevances[query_indptr[-2]]
        i = query_indptr[-2]
        while i < query_indptr[-1] and relevances[i] == r:
            i += 1
        if i == query_indptr[-1]:
            n = query_indptr.pop()
            del query_ids[-1]
            del indices[indptr[query_indptr[-1]]:]
            del data[indptr[query_indptr[-1]]:]
            del relevances[query_indptr[-1]:]
            del indptr[query_indptr[-1] + 1:]
            return n - query_indptr[-1]
        else:
            return 0

    for filepath in filepaths:
        lineno = 0  # Used just to report invalid lines (if any).

        logger.info('Reading queries from %s.' % filepath)

        #----------------------------------------------------------------
        # Call svmlight_loader here.
        #----------------------------------------------------------------
        (feature_vectors, relevances, qids) = \
            svmlight_loader.load_svmlight_file(filepath, query_ids=True)

        # Set up query_indptr and query_ids from qids.
        last_query_id = None

        # The code below mimics the original way of setting up query_indptr:
        # it initializes query_indptr to [0] and then appends a new pointer
        # (one past the previous one) whenever a new query id is seen.
        for i in range(0, len(qids)):
            if qids[i] != last_query_id:
                query_indptr.append(query_indptr[-1] + 1)
                query_ids.append(qids[i])
                last_query_id = qids[i]
            else:
                query_indptr[-1] += 1

    logger.info('Read %d queries and %d documents out of which '
                '%d queries and %d documents were discarded.'
                % (len(query_indptr) + n_purged_queries - 1,
                   query_indptr[-1] + n_purged_documents,
                   n_purged_queries, n_purged_documents))

    # Empty dataset.
    if len(query_indptr) == 1:
        raise ValueError('the input seems to be empty')

    # Set the minimum feature ID, if not given.
    if min_feature is None:
        min_feature = min(indices)

    if max_feature is None:
        raise NotImplementedError
        # Remap the features for a proper conversion into dense matrix.
        feature_indices = np.unique(np.r_[min_feature, indices])
        indices = np.searchsorted(feature_indices, indices)
    else:
        #assert min(indices) >= min_feature, ('there is a feature with id '
        #    'smaller than min_feature: %d < %d' % (min(indices), min_feature))
        assert feature_vectors.shape[1] <= max_feature, (
            'there is a feature with id greater than max_feature: %d > %d'
            % (feature_vectors.shape[1], max_feature))
        feature_indices = np.arange(min_feature, max_feature, dtype='int32')

    # Free the copies of the feature_vectors in non-Numpy arrays (if any),
    # this is important in order not to waste memory for the transfer of
    # the feature vectors to dense format (default option).
    del data, indices, indptr

    feature_vectors = feature_vectors.toarray()

    #--------------------------------------------------------------------
    # KDR: things that just get passed through:
    #   max_score
    #   has_sorted_relevances
    #--------------------------------------------------------------------
    # KDR: things we need to construct:
    #   feature_vectors: CSR matrix
    #   relevances: list of what I would call "labels", one per line
    #   query_indptr: row pointers to the start of each query
    #   query_ids: just a list of query ids in the order they appear
    #              (relies on the input being sorted)
    #   feature_indices: sequential list of features (current logic is fine)

    # Create and return a Queries object.
    return Queries(feature_vectors, relevances, query_indptr,
                   max_score=max_score,
                   has_sorted_relevances=has_sorted_relevances,
                   query_ids=query_ids,
                   feature_indices=feature_indices)
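# Standalone sketch of the query_indptr construction performed in the loop
# above (values here are illustrative only; qids would normally come from
# svmlight_loader.load_svmlight_file(..., query_ids=True)).
qids = [8, 8, 8, 13, 13]

query_indptr = [0]
query_ids = []
last_query_id = None
for qid in qids:
    if qid != last_query_id:
        query_indptr.append(query_indptr[-1] + 1)
        query_ids.append(qid)
        last_query_id = qid
    else:
        query_indptr[-1] += 1

print query_indptr   # [0, 3, 5]
print query_ids      # [8, 13]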
if (len(sys.argv) > 4) or (len(sys.argv) < 3):
    sys.stderr.write('libsvmFeatureFilter.py: Wrong number of arguments. '
                     'Must be 2 or 3.\n')
    sys.exit(1)
else:
    if (len(sys.argv) == 3):
        svmFilename = sys.argv[1]
        filterFile = sys.stdin
        outFilename = sys.argv[2]
    else:  # len(sys.argv) == 4
        svmFilename = sys.argv[1]
        filterFile = open(sys.argv[2], 'r')
        outFilename = sys.argv[3]

X, y = io.load_svmlight_file(svmFilename)
featureSubset = np.loadtxt(filterFile)
filterFile.close()

sanitycheck = np.unique(featureSubset)
print sanitycheck

X_col = sp.csc_matrix(X)

if X.shape[1] != featureSubset.shape[0]:
    sys.stderr.write('libsvmFeatureFilter.py: Dimension mismatch of filter and '
                     'features. Assuming that the feature list is a prefix of '
                     'the filter list.\n')
    featureSubset.resize(X.shape[1])

# filter columns
X_sub = X[:, featureSubset == 1]
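# The filter read by np.loadtxt above is expected to be a plain-text column of
# 0/1 flags, one flag per feature; columns flagged 1 are kept. A minimal
# hypothetical sketch of the same column selection (names and values are
# illustrative, not from libsvmFeatureFilter.py):
import numpy as np
import scipy.sparse as sp

X_demo = sp.csr_matrix(np.arange(15, dtype=float).reshape(3, 5))
featureFlags = np.array([1, 0, 1, 1, 0])

X_demo_sub = X_demo[:, featureFlags == 1]
print X_demo_sub.shape   # (3, 3)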
#! /usr/bin/env python
#from sklearn.datasets import load_svmlight_file
from svmlight_loader import load_svmlight_file
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
import pickle

print 'Loading feature file...'
X, y = load_svmlight_file('FeatureVectorBlueDefault.txt')

print 'Training classifier...'
clf = RandomForestClassifier(n_estimators=3, criterion='entropy', max_depth=20)
clf.fit(X, y)

print 'Saving model...'
file = open('forest.model', 'w')
pickle.dump(clf, file)
file.close()

#scores = cross_val_score(clf, X, y)
#print scores.mean()
def train(self, path):
    train_sparse, self.training_labels = svml.load_svmlight_file(path)
    train_dense = train_sparse.todense()
    self.training_data = np.asarray(train_dense)
import os
import math
import operator
import numpy as np
import matplotlib.pyplot as mplt
import svmlight_loader as svm  # assumed source of load_svmlight_file; this import is missing in the original snippet

# Number of nearest neighbors to use
k = 30
assert k <= 300

labels = ['electronic', 'metal', 'rap', 'classical']

# Load SVMLight files as numpy arrays
currdir = os.path.dirname(os.path.abspath(__file__))
trainfile = os.path.join(currdir, "data", "songsv1.train")
testfile = os.path.join(currdir, "data", "songsv1.test.txt")
x_train, y_train = svm.load_svmlight_file(trainfile)
x_test, y_test = svm.load_svmlight_file(testfile)

# Convert sparse matrices to dense matrices
x_train = x_train.todense()
x_test = x_test.todense()
x_train = np.array(x_train)
x_test = np.array(x_test)

# Statistics
num_correct = 0
num_correct_electronic = 0
num_correct_metal = 0
num_correct_rap = 0
num_correct_classical = 0
#!/usr/bin/python
from scipy.sparse import hstack
import svmlight_loader as io
import sys

if len(sys.argv) < 4:
    print '''Merge two files in libsvm / svmlight format into a single file.
Parameters: inFile1 inFile2 outFile

implemented June 2014 by Pascal Welke'''
    sys.exit(1)

fileOne = sys.argv[1]
fileTwo = sys.argv[2]
fileThree = sys.argv[3]

xOne, yOne = io.load_svmlight_file(fileOne)
xTwo, yTwo = io.load_svmlight_file(fileTwo)

X = hstack((xOne, xTwo))
io.dump_svmlight_file(X, yOne, fileThree, False)
def load_test_file(self, path):
    test_sparse, self.test_labels = svml.load_svmlight_file(path)
    test_dense = test_sparse.todense()
    self.test_data = np.asarray(test_dense)
#!/usr/bin/python
'''Usage: name INFILE OUTFILE PERCENTILE

Where INFILE is a file in libSVM format,
OUTFILE will be a file in libSVM format containing the chosen percentile of
top features, and
PERCENTILE is an integer specifying the percentage of features you want to keep.'''

import numpy as np
import pylab as pl
import svmlight_loader as io
import sys
from sklearn import datasets, svm
from sklearn.feature_selection import SelectPercentile, chi2

#################################################################
X, y = io.load_svmlight_file(sys.argv[1])

#################################################################
# Univariate feature selection with the chi-squared test for feature scoring.
# We keep the percentile of most significant features given on the command line.
selector = SelectPercentile(chi2, percentile=int(sys.argv[3]))
selector.fit(X, y)

#################################################################
# store output in file
Xsmall = selector.transform(X)
io.dump_svmlight_file(Xsmall, y, sys.argv[2], False)
def get_data(in_filename, n_features, **kwargs):
    data = load_svmlight_file(in_filename, n_features=n_features,
                              dtype=np.float32)
    return data[0], data[1]