def test_load_svmlight_files(): X_train, y_train, X_test, y_test = load_svmlight_files([datafile] * 2, dtype=np.float32) assert_array_equal(X_train.toarray(), X_test.toarray()) assert_array_equal(y_train, y_test) assert_equal(X_train.dtype, np.float32) assert_equal(X_test.dtype, np.float32) X1, y1, X2, y2, X3, y3 = load_svmlight_files([datafile] * 3, dtype=np.float64) assert_equal(X1.dtype, X2.dtype) assert_equal(X2.dtype, X3.dtype) assert_equal(X3.dtype, np.float64)
def classify_test(feature_list=[], classifiers=[], root_path='./'): #load data set datasets = [] for name in feature_list: logging.log(logging.DEBUG, 'loading data: %s ...' % name) filenames = tuple(['./feature/%s_%s' % (name, tag) for tag in ['train.txt', 'test.txt']]) X_train, y_train, X_test, y_test = load_svmlight_files(filenames) datasets.append((name, X_train, y_train, X_test, y_test)) #make directory to store results result_path = path.join(root_path, 'results') if path.exists(result_path): assert path.isdir(result_path), 'data must be a directory!' else: system('mkdir ' + result_path) for clf in classifiers: for feature in datasets: clf_name = clf.__class__.__name__ feature_name, X_train, y_train, X_test, y_test = feature combine_name = feature_name+'_'+clf_name info = {} logging.log(logging.DEBUG, 'classification test: %s ...' % combine_name) logging.log(logging.DEBUG, 'training...') t0 = time() clf.fit(X_train, y_train) t1 = time() info['training_time'] = t1-t0 logging.log(logging.DEBUG, 'testing on training...') pred_y = clf.predict(X_train) training_acc = accuracy_score(y_train, pred_y) logging.log(logging.DEBUG, 'error rate on training set: %f' % (1.0 - training_acc)) info['training_error'] = 1.0 - training_acc fout = open(path.join(result_path, combine_name+'_train.txt'), 'w') for y in pred_y: print >>fout, y fout.close() logging.log(logging.DEBUG, 'testing...') t0 = time() pred_y = clf.predict(X_test) t1 = time() info['test_time'] = t1-t0 test_acc = accuracy_score(y_test, pred_y) logging.log(logging.DEBUG, 'error rate on test set: %f' % (1.0 - test_acc)) info['test_error'] = 1.0 - test_acc fout = open(path.join(result_path, combine_name+'_test.txt'), 'w') for y in pred_y: print >>fout, y fout.close() yield combine_name, feature_name, clf_name, info
def pCoverX(featureFamily): os.chdir("C:\\Users\\Vaibhav\\Desktop\\dir_data\\dir_data\\train") path = "C:\\Users\\Vaibhav\\Desktop\\dir_data\\dir_data\\" data_df = pd.DataFrame() n_guass = 2 train_post_array = [] test_post_array = [] val_post_array = [] train_entropy_array = [] test_entropy_array = [] val_entropy_array = [] fileType = featureFamily+'*.gz' for file in glob.glob(fileType): print(file) X_train, y_train, X_test, y_test,X_val, y_val = load_svmlight_files((gzip.open(path+"train\\"+file), gzip.open(path+"test\\"+file),gzip.open(path+"validation\\"+file))) #X_train, y_train, X_test, y_test, X_val, y_val = load_svmlight_files(("train\\vision_cuboids_histogram.txt", "test\\vision_cuboids_histogram.txt","validation\\vision_cuboids_histogram.txt")) X_train = X_train[y_train!=31] X_test = X_test[y_test!=31] X_val = X_val[y_val!=31] y_train = y_train[y_train!=31] y_test = y_test[y_test!=31] y_val = y_val[y_val!=31] #========================= Feature Selection using Variance Thresold ============================================================= X_train_new, X_test_new , X_val_new = featureSelection(X_train,X_test,X_val,y_train, log=True,tech = 'LinearSVC') #========================= Mixture of Guassian ============================================================ train_prob,test_prob,val_prob = pXoverC(X_train_new, y_train, X_test_new, y_test, X_val_new, y_val, n_guass) #========================= Calculating Prior, Posterior and Entropy ============================================================ prr = prior(y_train) train_post = posterior(train_prob,prr) train_entropy = entropy(train_post) train_post_array.append(train_post) train_entropy_array.append(train_entropy) test_post = posterior(test_prob,prr) test_entropy = entropy(test_post) test_post_array.append(test_post) test_entropy_array.append(test_entropy) val_post = posterior(val_prob,prr) val_entropy = entropy(val_post) val_post_array.append(val_post) val_entropy_array.append(val_entropy) train_acc,c_mat = checkAccuracy(train_post,y_train) test_acc,c_mat = checkAccuracy(test_post,y_test) val_acc,c_mat = checkAccuracy(val_post,y_val) temp = pd.DataFrame([[file,train_acc,test_acc,val_acc]]) data_df = data_df.append(temp,ignore_index =True) return train_post_array,test_post_array,val_post_array,train_entropy_array,test_entropy_array,val_entropy_array,data_df
def test_load_zero_based_auto():
    data1 = b"-1 1:1 2:2 3:3\n"
    data2 = b"-1 0:0 1:1\n"

    f1 = BytesIO(data1)
    X, y = load_svmlight_file(f1, zero_based="auto")
    assert_equal(X.shape, (1, 3))

    f1 = BytesIO(data1)
    f2 = BytesIO(data2)
    X1, y1, X2, y2 = load_svmlight_files([f1, f2], zero_based="auto")
    assert_equal(X1.shape, (1, 4))
    assert_equal(X2.shape, (1, 4))
def test_load_with_qid():
    # load svmfile with qid attribute
    data = b"""
    3 qid:1 1:0.53 2:0.12
    2 qid:1 1:0.13 2:0.1
    7 qid:2 1:0.87 2:0.12"""
    X, y = load_svmlight_file(BytesIO(data), query_id=False)
    assert_array_equal(y, [3, 2, 7])
    assert_array_equal(X.todense(), [[0.53, 0.12], [0.13, 0.1], [0.87, 0.12]])
    res1 = load_svmlight_files([BytesIO(data)], query_id=True)
    res2 = load_svmlight_file(BytesIO(data), query_id=True)
    for X, y, qid in (res1, res2):
        assert_array_equal(y, [3, 2, 7])
        assert_array_equal(qid, [1, 1, 2])
        assert_array_equal(X.todense(), [[0.53, 0.12], [0.13, 0.1], [0.87, 0.12]])
def test_load_with_qid(): # load svmfile with qid attribute data = b(""" 3 qid:1 1:0.53 2:0.12 2 qid:1 1:0.13 2:0.1 7 qid:2 1:0.87 2:0.12""") X, y = load_svmlight_file(BytesIO(data), query_id=False) assert_array_equal(y, [3, 2, 7]) assert_array_equal(X.todense(), [[.53, .12], [.13, .1], [.87, .12]]) res1 = load_svmlight_files([BytesIO(data)], query_id=True) res2 = load_svmlight_file(BytesIO(data), query_id=True) for X, y, qid in (res1, res2): assert_array_equal(y, [3, 2, 7]) assert_array_equal(qid, [1, 1, 2]) assert_array_equal(X.todense(), [[.53, .12], [.13, .1], [.87, .12]])
def main():
    x_train, y_train, x_test, y_test = load_svmlight_files(
        ['data/rank.train', 'data/rank.test'])
    train_query = pd.read_csv('data/rank.train.query',
                              header=None).values.flatten()

    model = lgbm.LGBMRanker(num_leaves=50, n_estimators=200, random_state=42)
    print(model)
    model.fit(x_train, y_train, group=train_query,
              eval_metric='ndcg', eval_at=[1, 3, 5])

    preds = model.predict(x_test)
    print(spearmanr(y_test, preds))
    print('DONE')
def select_feature(trainfilename, testfilename): def returnCHI(X, y): return chivalue X_train, y_train, X_test, y_test = load_svmlight_files((trainfilename, testfilename)) featureNum = X_train.get_shape()[1] chivalue = chi2(X_train, y_train) step = featureNum / 20; for i in range(1, 21): selectNum = step * i print "selecting", selectNum, "features" selector = SelectKBest(chi2, k=selectNum) X_train_new = selector.fit_transform(X_train, y_train) X_test_new= selector.transform(X_test) sklearn.datasets.dump_svmlight_file(X_train_new, y_train, trainfilename + '_' + str(selectNum), zero_based = False) sklearn.datasets.dump_svmlight_file(X_test_new, y_test, testfilename + '_' + str(selectNum), zero_based = False)
def load_amazon(source_name, target_name, data_folder=None, verbose=False): if data_folder is None: data_folder = './data/' source_file = data_folder + source_name + '_train.svmlight' target_file = data_folder + target_name + '_train.svmlight' test_file = data_folder + target_name + '_test.svmlight' if verbose: print('source file:', source_file) print('target file:', target_file) print('test file: ', test_file) xs, ys, xt, yt, xt_test, yt_test = load_svmlight_files( [source_file, target_file, test_file]) ys, yt, yt_test = (np.array((y + 1) / 2, dtype=int) for y in (ys, yt, yt_test)) return xs, ys, xt, yt, xt_test, yt_test
def select_feature(trainfilename, testfilename): def returnCHI(X, y): return chivalue X_train, y_train, X_test, y_test = load_svmlight_files((trainfilename, testfilename), multilabel=True) featureNum = X_train.get_shape()[1] chivalue = chi2(X_train, y_train) step = featureNum / 20; for i in range(1, 21): selectNum = step * i print "selecting", selectNum, "features" selector = SelectKBest(chi2, k=selectNum) X_train_new = selector.fit_transform(X_train, y_train) X_test_new= selector.transform(X_test) dump_svmlight_file(X_train_new, y_train, trainfilename + '_' + str(selectNum), zero_based = False) dump_svmlight_file(X_test_new, y_test, testfilename + '_' + str(selectNum), zero_based = False)
def get_url(num_rows=None): url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/url/url_svmlight.tar.gz' filename = 'url_svmlight.tar.gz' if not os.path.isfile(filename): urlretrieve(url, filename) tar = tarfile.open(filename, "r:gz") tar.extractall() tar.close() num_files = 120 files = ['url_svmlight/Day{}.svm'.format(day) for day in range(num_files)] data = datasets.load_svmlight_files(files) X = vstack(data[::2]) if num_rows is not None: X = X[0:num_rows] return X
def support_vector_machines_datasets(self): """ Support Vector Machines (SVMs) <label> <feature-id>:<feature-value> <feature-id>:<feature-value> 1 qid:2 1:0 2:0 3:1 4:0.2 5:0 2 qid:2 1:1 2:0 3:1 4:0.4 5:0 svmlight SVM Light is a C program by Thorsten Joachims that implements a support vector machine. provides several kernels, such as linear, polynomial, radial basis function, and sigmoid LIBSVM -- A Library for Support Vector Machines, It supports multi-class classification. """ logging.debug('----------------- Support Vector Machines -----------') X_train, y_train = datasets.load_svmlight_file("../data/svmlight/example3/train.dat") print("Support Vector Machines \n" , X_train, y_train) X_train, y_train, X_test, y_test = datasets.load_svmlight_files(("../data/svmlight/example3/train.dat","../data/svmlight/example3/test.dat")) print(' X_train ', X_train, 'y_train ', y_train, ' X_test ', X_test, 'y_test ', y_test)
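# A minimal sketch (not from the original project; file name and values are
# illustrative) of how the "<label> <feature-id>:<feature-value>" layout
# described above round-trips through scikit-learn's svmlight helpers.
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.datasets import dump_svmlight_file, load_svmlight_file

X = csr_matrix([[0.0, 0.0, 1.0, 0.2, 0.0],
                [1.0, 0.0, 1.0, 0.4, 0.0]])
y = np.array([1, 2])
qid = np.array([2, 2])

# writes lines like "1 qid:2 3:1 4:0.2" (zero entries are omitted)
dump_svmlight_file(X, y, "tiny.svmlight", query_id=qid, zero_based=False)

X2, y2, qid2 = load_svmlight_file("tiny.svmlight", query_id=True, zero_based=False)
print(X2.toarray(), y2, qid2)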
def setUpClass(cls): """ Download and setup the test fixtures """ from sklearn.datasets import load_svmlight_files # download the test data cls.dpath = 'demo/rank/' src = 'https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip' target = cls.dpath + '/MQ2008.zip' urllib.request.urlretrieve(url=src, filename=target) with zipfile.ZipFile(target, 'r') as f: f.extractall(path=cls.dpath) (x_train, y_train, qid_train, x_test, y_test, qid_test, x_valid, y_valid, qid_valid) = load_svmlight_files( (cls.dpath + "MQ2008/Fold1/train.txt", cls.dpath + "MQ2008/Fold1/test.txt", cls.dpath + "MQ2008/Fold1/vali.txt"), query_id=True, zero_based=False) # instantiate the matrices cls.dtrain = xgboost.DMatrix(x_train, y_train) cls.dvalid = xgboost.DMatrix(x_valid, y_valid) cls.dtest = xgboost.DMatrix(x_test, y_test) # set the group counts from the query IDs cls.dtrain.set_group( [len(list(items)) for _key, items in itertools.groupby(qid_train)]) cls.dtest.set_group( [len(list(items)) for _key, items in itertools.groupby(qid_test)]) cls.dvalid.set_group( [len(list(items)) for _key, items in itertools.groupby(qid_valid)]) # save the query IDs for testing cls.qid_train = qid_train cls.qid_test = qid_test cls.qid_valid = qid_valid # model training parameters cls.params = { 'objective': 'rank:pairwise', 'booster': 'gbtree', 'silent': 0, 'eval_metric': ['ndcg'] }
def load_amazon(source_name, target_name, data_folder=None, verbose=False): """ Load the amazon sentiment datasets from svmlight format files inputs: source_name : name of the source dataset target_name : name of the target dataset data_folder : path to the folder containing the files outputs: xs : training source data matrix ys : training source label vector xt : training target data matrix yt : training target label vector xtest : testing target data matrix ytest : testing target label vector """ if data_folder is None: data_folder = 'data/' source_file = data_folder + source_name + '_train.svmlight' target_file = data_folder + target_name + '_train.svmlight' test_file = data_folder + target_name + '_test.svmlight' if verbose: print('source file:', source_file) print('target file:', target_file) print('test file: ', test_file) xs, ys, xt, yt, xtest, ytest = load_svmlight_files( [source_file, target_file, test_file]) # Convert sparse matrices to numpy 2D array xs, xt, xtest = (np.array(X.todense()) for X in (xs, xt, xtest)) # Convert {-1,1} labels to {0,1} labels ys, yt, ytest = (np.array((y + 1) / 2, dtype=int) for y in (ys, yt, ytest)) num_labels = len(set(ys)) ys_onehot = np.eye(num_labels)[ys] yt_onehot = np.eye(num_labels)[yt] ytest_onehot = np.eye(num_labels)[ytest] return xs, ys_onehot, xt, yt_onehot, xtest, ytest_onehot
def get_mq2008(dpath): from sklearn.datasets import load_svmlight_files src = 'https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip' target = dpath + '/MQ2008.zip' if not os.path.exists(target): urllib.request.urlretrieve(url=src, filename=target) with zipfile.ZipFile(target, 'r') as f: f.extractall(path=dpath) (x_train, y_train, qid_train, x_test, y_test, qid_test, x_valid, y_valid, qid_valid) = load_svmlight_files( (dpath + "MQ2008/Fold1/train.txt", dpath + "MQ2008/Fold1/test.txt", dpath + "MQ2008/Fold1/vali.txt"), query_id=True, zero_based=False) return (x_train, y_train, qid_train, x_test, y_test, qid_test, x_valid, y_valid, qid_valid)
def select_feature_multilabel(trainfilename, testfilename): def returnIG(X, y): return randval, p X_train, y_train, X_test, y_test = load_svmlight_files((trainfilename, testfilename), multilabel=True) featurenum = X_train.shape[1] randval = randomValues(X_train, y_train) p = np.ones((featurenum,1), int) p.reshape(featurenum,1) featureNum = X_train.get_shape()[1] step = featureNum / 20; for i in range(1, 21): selectNum = step * i print "selecting", selectNum, "features" selector = SelectKBest(returnIG, k=selectNum) X_train_new = selector.fit_transform(X_train, y_train) X_test_new = selector.transform(X_test) dump_svmlight_file(X_train_new, y_train, trainfilename + '_' + str(selectNum), zero_based = False) dump_svmlight_file(X_test_new, y_test, testfilename + '_' + str(selectNum), zero_based = False)
def run_nblcr(train, test, outfn, grams='123', clf=LogisticRegression(class_weight="auto")):
    f_train = outfn + '-train.txt'
    f_test = outfn + '-test.txt'
    ngram = [int(i) for i in grams]

    ptrain = []
    ntrain = []
    for _, row in train.iterrows():
        if row['label'] == 1:
            ptrain.append(tokenize(row['text'], ngram))
        elif row['label'] == 0:
            ntrain.append(tokenize(row['text'], ngram))

    pos_counts = build_dict(ptrain, ngram)
    neg_counts = build_dict(ntrain, ngram)
    dic, r = compute_ratio(pos_counts, neg_counts)
    generate_svmlight_file(train, dic, r, ngram, f_train)
    generate_svmlight_file(test, dic, r, ngram, f_test)

    X_train, y_train, X_test, _ = load_svmlight_files((f_train, f_test))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    try:
        y_prob = clf.predict_proba(X_test)
    except AttributeError:
        # SVMs only expose predict_proba when fitted with probability=True,
        # so enable probability estimates and refit before predicting
        clf.set_params(probability=True)
        clf.fit(X_train, y_train)
        y_prob = clf.predict_proba(X_test)
    return y_pred, y_prob
def select_feature(trainfilename, testfilename): def returnIG(X, y): return ig, p X_train, y_train, X_test, y_test = load_svmlight_files((trainfilename, testfilename)) featurenum = X_train.shape[1] ig = information_gain(X_train, y_train) ig = ig.reshape(featurenum,) p = np.ones((1,featurenum), int) p.reshape(featurenum,1) featureNum = X_train.get_shape()[1] step = featureNum / 20; for i in range(1, 21): selectNum = step * i print "selecting", selectNum, "features" selector = SelectKBest(returnIG, k=selectNum) X_train_new = selector.fit_transform(X_train, y_train) X_test_new = selector.transform(X_test) sklearn.datasets.dump_svmlight_file(X_train_new, y_train, trainfilename + '_' + str(selectNum), zero_based = False) sklearn.datasets.dump_svmlight_file(X_test_new, y_test, testfilename + '_' + str(selectNum), zero_based = False)
def load_e2006():
    # load data
    feature_tr, label_tr, feature_te, label_te = load_svmlight_files(
        ['./data/E2006.train', './data/E2006.test'], n_features=150360)
    feature = vstack([feature_tr, feature_te])
    # concatenate train/test labels
    label = np.concatenate([label_tr, label_te], axis=0)
    # remove outliers from labels
    std_y = np.std(label)
    mean_y = np.mean(label)
    mask = np.logical_and(label > mean_y - 3.0 * std_y, label < mean_y + 3.0 * std_y)
    print(f'keep {np.sum(mask)} / {len(mask)} rows')
    # select rows
    feature = feature[mask]
    label = label[mask]
    # scale labels with a min-max scaler
    label = label[:, None]
    scaler = MinMaxScaler()
    scaler.fit(label)
    label = scaler.transform(label).squeeze()
    return feature * 10, label
def load_dmatrix(filename): ''' NOTE(sneaxiy): XGBoost distributed training using rabit would split CSV/LIBSVM file into N pieces automatically, where N is the worker number. However, in our implementation, we dump different data file into each worker, and each worker should not split the dumped file again when training. Otherwise, some data would be lost. To prevent the automatic data sharding by XGBoost itself, we load the LIBSVM file using 'sklearn.datasets.load_svmlight_file' to be a CSR sparse matrix first, and then convert it to 'xgboost.DMatrix'. See https://github.com/sql-machine-learning/sqlflow/issues/2326 in detailed. ''' if xgb.rabit.get_world_size() > 1: # XGBoost DMatrix supports to load data from file path like # "train.txt#train.txt.cache". The actual data path is # "train.txt", while "train.txt.cache" is used as the # external memory cache. But "train.txt#train.txt.cache" # is not a valid file path, and it is not supported by # load_svmlight_file(s). So we remove the suffix "#..." # here before loading the data using load_svmlight_file(s). if '#' in filename: filename = filename[0:filename.index('#')] if os.path.isdir(filename): files = [os.path.join(filename, f) for f in os.listdir(filename)] assert len(files) > 0, "No data file found in {}".format(filename) ret = load_svmlight_files(files, zero_based=True) X = vstack(ret[0::2]) y = np.concatenate(ret[1::2], axis=0) return xgb.DMatrix(X, y, missing=XGBOOST_NULL_MAGIC) else: ret = load_svmlight_file(filename, zero_based=True) return xgb.DMatrix(ret[0], ret[1], missing=XGBOOST_NULL_MAGIC) else: return xgb.DMatrix(filename, missing=XGBOOST_NULL_MAGIC)
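# Small sketch of the merge step used in load_dmatrix above: load_svmlight_files
# returns (X0, y0, X1, y1, ...), so the even slots hold the sparse feature blocks
# and the odd slots hold the 1-D label arrays. The part file names are placeholders.
import numpy as np
from scipy.sparse import vstack
from sklearn.datasets import load_svmlight_files

ret = load_svmlight_files(["part-0.svm", "part-1.svm"], zero_based=True)
X = vstack(ret[0::2])          # stack feature matrices row-wise
y = np.concatenate(ret[1::2])  # concatenate label vectors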
def load_dataset(train_path, test_path, threshold=5):
    """
    Generator that yields an SvmSet for each set (train, test).
    Loads the svmlight-format files into scikit-learn sparse datasets and
    normalizes the feature values with MinMaxScaler.
    If the score is above threshold (default = 5) it is normalized to 1 (positive),
    otherwise to 0 (negative).
    :param train_path: train set path
    :param test_path: test set path
    :param threshold: threshold that defines the pivot value
    :return: yields one SvmSet per input file
    """
    files = [train_path, test_path]
    dataset = datasets.load_svmlight_files(files=files, zero_based=True, query_id=True, multilabel=False)
    for (x, y, qid) in [dataset[i:i + 3] for i in range(0, len(dataset), 3)]:
        # MinMaxScaler expects a 2-D array, so scale the sparse data vector column-wise
        x.data = preprocessing.MinMaxScaler().fit_transform(x.data.reshape(-1, 1)).ravel()
        for idx, score in enumerate(y):
            y[idx] = 1 if score > threshold else 0
        yield SvmSet(x=x, y=y, qid=qid)
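# Hypothetical use of load_dataset above; the svmlight paths are placeholders and
# SvmSet is assumed to expose the x / y / qid fields it is constructed with.
train_set, test_set = load_dataset("rank.train.svmlight", "rank.test.svmlight", threshold=5)
print(train_set.x.shape, train_set.y[:5], train_set.qid[:5])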
def load_dataset(path_train, path_valid, path_test, n_features, multilabel=False, classes_=None): le = LabelEncoder2(multilabel=multilabel) X, Y, Xvalid, Yvalid, Xtest, Ytest = load_svmlight_files( (path_train, path_valid, path_test), dtype=np.float32, n_features=n_features, multilabel=multilabel) if classes_ is None: le.fit(np.concatenate((Y, Yvalid, Ytest), axis=0)) Y = le.transform(Y) Yvalid = le.transform(Yvalid) Ytest = le.transform(Ytest) else: le.set_classes(classes_) Y = le.transform(Y) Yvalid = le.transform(Yvalid) return X, Y, Xvalid, Yvalid, Xtest, Ytest
def load_amazon(source_name, target_name, data_folder=None, verbose=False): """ Load the amazon sentiment datasets from svmlight format files inputs: source_name : name of the source dataset target_name : name of the target dataset data_folder : path to the folder containing the files outputs: xs : training source data matrix ys : training source label vector xt : training target data matrix yt : training target label vector xtest : testing target data matrix ytest : testing target label vector """ if data_folder is None: data_folder = 'data/' source_file = data_folder + source_name + '_train.svmlight' target_file = data_folder + target_name + '_train.svmlight' test_file = data_folder + target_name + '_test.svmlight' if verbose: print('source file:', source_file) print('target file:', target_file) print('test file: ', test_file) xs, ys, xt, yt, xtest, ytest = load_svmlight_files([source_file, target_file, test_file]) # Convert sparse matrices to numpy 2D array xs, xt, xtest = (np.array(X.todense()) for X in (xs, xt, xtest)) # Convert {-1,1} labels to {0,1} labels ys, yt, ytest = (np.array((y + 1) / 2, dtype=int) for y in (ys, yt, ytest)) return xs, ys, xt, yt, xtest, ytest
def mxTrainer(relationName, train, test, train_pair, test_pair):
    X_train, y_train, X_test, y_test = load_svmlight_files([train, test])
    X_train_col = X_train.shape[1]
    X_test_col = X_test.shape[1]
    col = max(X_test_col, X_train_col)
    train_iter = mx.io.LibSVMIter(data_libsvm=train, data_shape=(col, ), batch_size=100)
    test_iter = mx.io.LibSVMIter(data_libsvm=test, data_shape=(col, ), batch_size=100)
    print(test_iter)
    mod = rankNet()
    mod.bind(data_shapes=train_iter.provide_data, label_shapes=train_iter.provide_label)
    mod.fit(train_iter, num_epoch=5, optimizer="AdaGrad")
    y_pred = mod.predict(test_iter)
    print(relationName + str(y_pred.shape) + str(col))
    y_pred = y_pred.asnumpy().reshape(y_pred.shape[0])
    print(str(y_pred.shape) + str(y_test.shape))
    test_pair['score'] = y_pred
    print(roc_auc_score(y_true=y_test.reshape(y_test.shape[0]), y_score=y_pred))
    writeScoresInPraStyle(test_pair, train_pair, relationName)
def main():
    args = get_args()

    # Load training data
    data_train = load_svmlight_files(args.input)
    X_train = vstack(data_train[0::2]).toarray()
    # labels come back as 1-D arrays, so concatenate them instead of stacking
    y_train = np.concatenate(data_train[1::2])

    # Make model
    if args.model == 'rf':
        model = RandomForestClassifier()
        param_grid = rf_param_grid()
    elif args.model == 'svm_rbf':
        model = SVC()
        param_grid = svm_rbf_param_grid()

    # Grid search hyperparameters
    grid_search = GridSearchCV(estimator=model,
                               scoring='average_precision',
                               param_grid=param_grid,
                               cv=KFold(len(X_train), n_folds=args.kfolds,
                                        shuffle=True, random_state=args.seed),
                               n_jobs=args.processes,
                               verbose=2)
    grid_search.fit(X_train, y_train)
    pkl.dump(grid_search, open('temp.pkl', 'wb'))
probs.append(score_i) return probs parser = argparse.ArgumentParser() #parser.add_argument( "train_file" ) parser.add_argument( "-p", "--predict", help = "if is to make predictions in a test file", default = None ) parser.add_argument( "-t", "--predict_file", help = "if is to make predictions in a test file", default = None ) parser.add_argument( "-c", "--cross_validation", help = "if have make cross-validation", default = None ) args = parser.parse_args() classifier = LDA(n_components=2) #classifier = RandomForestClassifier() X_url, y, X_title, y_t, X_body, y_b, X_a, y_a = load_svmlight_files(("url_train.txt", "title_train.txt", "body_train.txt", "all_train.txt")) X = {"url":X_url, "title": X_title, "body": X_body, "all": X_a} if(args.predict): print "Predicting" T_url, t, T_title, y_t, T_body, y_b, T_a, y_a = load_svmlight_files(("url_test.txt", "title_test.txt", "body_test.txt", "all_test.txt")) T = {"url": T_url, "title": T_title, "body": T_body, "all": T_a} probs = predict(classifier, X, y, T, t) f = open("sub_31-08_01h15.txt","w") f.write("label\n") for p in probs: line = "%f\n" % p f.write(line) f.close() elif(args.cross_validation):
import sys

import numpy as np
import sklearn.datasets
from sklearn.datasets import load_svmlight_files
from sklearn.feature_selection import SelectKBest


def documentFrequency(X, y):
    featurenum = X.shape[1]
    s = sum(X).toarray()
    p = np.ones((1, featurenum), int)
    return s.reshape(featurenum), p.reshape(featurenum, 1)


if __name__ == "__main__":
    if len(sys.argv) != 4:
        print "Usage: python %s threshold trainfilename testfilename" % sys.argv[0]
        exit(1)
    trainfilename = sys.argv[2]
    testfilename = sys.argv[3]
    X_train, y_train, X_test, y_test = load_svmlight_files((trainfilename, testfilename))
    df = sum(X_train).toarray()[0]
    cnt = 0
    threshold = int(sys.argv[1])
    for i in range(0, len(df)):
        if df[i] >= threshold:
            cnt = cnt + 1
    selector = SelectKBest(documentFrequency, k=cnt)
    X_train = selector.fit_transform(X_train, y_train)
    X_test = selector.transform(X_test)
    sklearn.datasets.dump_svmlight_file(X_train, y_train, trainfilename + "_" + str(cnt), zero_based=False)
    sklearn.datasets.dump_svmlight_file(X_test, y_test, testfilename + "_" + str(cnt), zero_based=False)
    print cnt, "features selected"
# remove axis spines
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["bottom"].set_visible(False)
ax.spines["left"].set_visible(False)

plt.grid()
plt.tight_layout()
plt.show()

os.chdir(r"F:\Analytics\ISB Study\Capstone\dir_data\dir_data")
X_train, y_train, X_test, y_test, X_val, y_val = load_svmlight_files(("train\\vision_cuboids_histogram.txt", "test\\vision_cuboids_histogram.txt", "validation\\vision_cuboids_histogram.txt"))
np.unique(y_train)

sklearn_lda = LDA(n_components=30)
X_lda_sklearn = sklearn_lda.fit_transform(X_train.todense(), y_train)
plot_scikit_lda(X_lda_sklearn, title='LDA vision_cuboids_histogram')

# PCA
sklearn_pca = sklearnPCA(n_components=30)
X_pca = sklearn_pca.fit_transform(X_train.todense())
plot_pca(title='PCA vision_cuboids_histogram')

#
X_ldapca_sklearn = sklearn_pca.fit_transform(X_lda_sklearn)
plot_scikit_lda(X_ldapca_sklearn, title='LDA+PCA LDA vision_cuboids_histogram', mirror=(-1))
letter_dataset = np.genfromtxt('realData/letter-recognition.data', delimiter=",", converters={0: letter_label}) ndim = len(letter_dataset[0]) data = np.zeros((len(letter_dataset), ndim - 1)) target = np.zeros((len(letter_dataset))) for i in xrange(len(letter_dataset)): target[i] = letter_dataset[i][0].astype(int) # last feature is label for j in xrange(1, ndim): data[i, j - 1] = letter_dataset[i][j] target = target.astype(int) elif (dataset_name == "gas"): files = ["realData/Gas/batch" + str(i) + ".dat" for i in xrange(1, 11)] batches = skd.load_svmlight_files(files) data = batches[0].todense() target = batches[1] len_target = len(target) target = np.reshape(target, (len_target, 1)) for idx in xrange(2, 11): batch_data = batches[(idx - 1) * 2].todense() batch_target = batches[2 * idx - 1] len_batch_target = len(batch_target) batch_target = np.reshape(batch_target, (len_batch_target, 1)) data = np.concatenate((data, batch_data), axis=0) target = np.concatenate((target, batch_target), axis=0)
Load data
'''
fe_dir = 'xgb_feature_pool'
fe_file = 'selected_feature.csv'
train_date = '2016-04-06'
valid_date = '2016-04-10'
test_date = '2016-04-16'

train_data = pd.read_csv(os.path.join(fe_dir, train_date, fe_file))
valid_data = pd.read_csv(os.path.join(fe_dir, valid_date, fe_file))
test_data = pd.read_csv(os.path.join(fe_dir, test_date, fe_file))

train_sparse_file = 'XGB输出的稀疏特征/v3.train.svm'
valid_sparse_file = 'XGB输出的稀疏特征/v3.valid.svm'
X_train_sparse, _, X_valid_sparse, _ = load_svmlight_files([train_sparse_file, valid_sparse_file])

valid_data_9 = pd.read_csv(os.path.join(fe_dir, '2016-04-09', fe_file))
valid_data_11 = pd.read_csv(os.path.join(fe_dir, '2016-04-11', fe_file))

train_data.fillna(-1, inplace=True)
valid_data.fillna(-1, inplace=True)
test_data.fillna(-1, inplace=True)
valid_data_9.fillna(-1, inplace=True)
valid_data_11.fillna(-1, inplace=True)

print 'Read done.'

'''
Get cross-validation indices
model = Word2Vec.load(model_name) print "Creating the w2v vectors...\n" X_train_w2v = scale(getAvgFeatureVecs(getCleanReviews(train), model, n_dim)) X_test_w2v = scale(getAvgFeatureVecs(getCleanReviews(test), model, n_dim)) print "Generating the svmlight-format files...\n" generate_svmlight_files(train, test, '123', '../data/nbsvm') print "Creating the nbsvm...\n" files = ("../data/nbsvm-train.txt", "../data/nbsvm-test.txt") X_train_nbsvm, _, X_test_nbsvm, _ = load_svmlight_files(files) print "Combing the bag of words and the w2v vectors...\n" X_train_bwv = hstack([X_train_bow, X_train_w2v]) X_test_bwv = hstack([X_test_bow, X_test_w2v]) print "Combing the bag of words and the d2v vectors...\n" X_train_bdv = hstack([X_train_bow, X_train_d2v]) X_test_bdv = hstack([X_test_bow, X_test_d2v]) print "Checking the dimension of training vectors"
def textpCoverX(): os.chdir("C:\\Users\\Vaibhav\\Desktop\\dir_data\\dir_data\\train") path = "C:\\Users\\Vaibhav\\Desktop\\dir_data\\dir_data\\" data_df = pd.DataFrame() train_post_array = [] test_post_array = [] val_post_array = [] train_entropy_array = [] test_entropy_array = [] val_entropy_array = [] for file in glob.glob("text*.gz"): print(file) X_train, y_train, X_test, y_test, X_val, y_val = load_svmlight_files( (gzip.open(path + "train\\" + file), gzip.open(path + "test\\" + file), gzip.open(path + "validation\\" + file))) X_train = X_train[y_train != 31] X_test = X_test[y_test != 31] X_val = X_val[y_val != 31] y_train = y_train[y_train != 31] y_test = y_test[y_test != 31] y_val = y_val[y_val != 31] svmClf = Pipeline([ ('clf', SGDClassifier(loss='log', penalty='l1', alpha=1e-6, n_iter=10, random_state=88)), ]) svmClf = svmClf.fit(X_train, y_train) predicted_train = svmClf.predict(X_train) train_acc = np.mean(predicted_train == y_train) print "Train Model Accuracy %f" % train_acc train_post = pd.DataFrame(svmClf.predict_proba(X_train)) predicted_test = svmClf.predict(X_test) test_acc = np.mean(predicted_test == y_test) print "Test Model Accuracy %f" % test_acc test_post = pd.DataFrame(svmClf.predict_proba(X_test)) predicted_val = svmClf.predict(X_val) val_acc = np.mean(predicted_val == y_val) print "Validation Model Accuracy %f" % val_acc val_post = pd.DataFrame(svmClf.predict_proba(X_val)) train_entropy = entropy(train_post) train_post_array.append(train_post) train_entropy_array.append(train_entropy) test_entropy = entropy(test_post) test_post_array.append(test_post) test_entropy_array.append(test_entropy) val_entropy = entropy(val_post) val_post_array.append(val_post) val_entropy_array.append(val_entropy) temp = pd.DataFrame([[file, train_acc, test_acc, val_acc]]) data_df = data_df.append(temp, ignore_index=True) return train_post_array, test_post_array, val_post_array, train_entropy_array, test_entropy_array, val_entropy_array, data_df
def test_load_invalid_file2(): with pytest.raises(ValueError): load_svmlight_files([datafile, invalidfile, datafile])
'''
Load data
'''
fe_dir = 'xgb_feature_pool'
fe_file = 'selected_feature.csv'
train_date = '2016-04-06'
valid_date = '2016-04-10'
test_date = '2016-04-16'

train_data = pd.read_csv(os.path.join(fe_dir, train_date, fe_file))
valid_data = pd.read_csv(os.path.join(fe_dir, valid_date, fe_file))
test_data = pd.read_csv(os.path.join(fe_dir, test_date, fe_file))

train_sparse_file = 'XGB输出的稀疏特征/v3.train.svm'
valid_sparse_file = 'XGB输出的稀疏特征/v3.valid.svm'
X_train_sparse, _, X_valid_sparse, _ = load_svmlight_files(
    [train_sparse_file, valid_sparse_file])

valid_data_9 = pd.read_csv(os.path.join(fe_dir, '2016-04-09', fe_file))
valid_data_11 = pd.read_csv(os.path.join(fe_dir, '2016-04-11', fe_file))

train_data.fillna(-1, inplace=True)
valid_data.fillna(-1, inplace=True)
test_data.fillna(-1, inplace=True)
valid_data_9.fillna(-1, inplace=True)
valid_data_11.fillna(-1, inplace=True)

print 'Read done.'

'''
Get cross-validation indices
'''
from sklearn.model_selection import KFold
val_post_array.append(val_post) val_entropy_array.append(val_entropy) temp = pd.DataFrame([[file, train_acc, test_acc, val_acc]]) data_df = data_df.append(temp, ignore_index=True) return train_post_array, test_post_array, val_post_array, train_entropy_array, test_entropy_array, val_entropy_array, data_df #=============================================== Main ================================================================= #os.chdir("F:\Analytics\ISB Study\Capstone\dir_data\dir_data") os.chdir("C:\Users\Vaibhav\Desktop\dir_data\dir_data") X_train, y_train, X_test, y_test, X_val, y_val = load_svmlight_files( ("train\\vision_hist_motion_estimate.txt", "test\\vision_hist_motion_estimate.txt", "validation\\vision_hist_motion_estimate.txt")) #================ First Level of Fusion - Audio =============================== train_post_array, test_post_array, val_post_array, train_entropy_array, test_entropy_array, val_entropy_array, data_df = pCoverX( 'audio') data_df.columns = [ 'filename', 'train Accuracy', 'test Accuracy', 'validation Accuracy' ] data_df.to_csv('Audio_preComb_Acc.csv', index=False) alpha = 1 comb1_audio_train = combiner(train_post_array, train_entropy_array, alpha) comb1_audio_test = combiner(test_post_array, test_entropy_array, alpha) comb1_audio_val = combiner(val_post_array, val_entropy_array, alpha)
@raises(ValueError)
def test_load_invalid_file2():
    load_svmlight_files([datafile, invalidfile, datafile])
import numpy as np from scipy.sparse import csr_matrix from sklearn import datasets from sklearn.decomposition import PCA from sklearn.preprocessing import MultiLabelBinarizer from sklearn.svm import LinearSVC from mlclas.ensemble import BinaryRelevance, ClassifierChains, CalibratedLabelRanking, RandomKLabelsets, MLKNN from mlclas.tree import MLDecisionTree from mlclas.neural import BPMLL from mlclas.svm import RankingSVM from mlclas.stats import UniversalMetrics files = ['datasets/scene_train', 'datasets/scene_test'] # load files data = datasets.load_svmlight_files(files, multilabel=True) train_data = data[0] train_target = np.array(MultiLabelBinarizer().fit_transform(data[1])) test_data = data[2] test_target = data[3] # feature extraction using PCA feature_size = train_data.shape[1] pca = PCA(n_components=(feature_size * 10) // 100) train_data_trans = csr_matrix(pca.fit_transform(train_data.todense())) test_data_trans = csr_matrix(pca.transform(test_data.todense())) """ train and predict using any of following scripts: 1. result = BinaryRelevance(LinearSVC()).fit(train_data, train_target).predict(test_data)
x = range(len(data)) plt.xticks(x,data[data.columns[0]],rotation='vertical') for i in range(1,len(data.columns)): plt.plot(x,data[data.columns[i]]) plt.legend(data.columns[1:], loc='upper left') plt.xlabel(data.columns[0]) plt.ylabel('Accuracy') plt.title('Accuracy plot for ' + fileName) plt.show() #=============================================== Main ================================================================= os.chdir("F:\Analytics\ISB Study\Capstone\dir_data\Capstone") #os.chdir("C:\Users\Vaibhav\Desktop\dir_data\dir_data") X_train, y_train, X_test, y_test, X_val, y_val = load_svmlight_files(("train\\vision_hist_motion_estimate.txt", "test\\vision_hist_motion_estimate.txt","validation\\vision_hist_motion_estimate.txt")) y_train = y_train[y_train!=31] y_test = y_test[y_test!=31] y_val = y_val[y_val!=31] #y_train = y_train[y_train <=2] #y_test = y_test[y_test<=2] #y_val = y_val[y_val<=2] #================ First Level of Fusion - Audio =============================== n_guass =5 nClass = 30 train_post_array,test_post_array,val_post_array,train_entropy_array,test_entropy_array,val_entropy_array,data_df = pCoverX('audio',n_guass,tech = 'LinearSVC',C= 0.5,nClass=30) data_df.columns = ['filename','train Accuracy','test Accuracy','validation Accuracy'] data_df.to_csv('Audio_preComb_Acc0801.csv',index=False) audioComb1Acc = pd.DataFrame() for alpha in [1,2,3,4,5]:
def textpCoverX(): #os.chdir("F:\\Analytics\\ISB Study\\Capstone\\dir_data\\dir_data\\train") #path = "F:\\Analytics\\ISB Study\\Capstone\\dir_data\\dir_data\\" path = "C:\\Users\\Vaibhav\\Desktop\\dir_data\\dir_data\\" os.chdir(path+'train') data_df = pd.DataFrame() train_post_array = [] test_post_array = [] val_post_array = [] train_entropy_array = [] test_entropy_array = [] val_entropy_array = [] for file in glob.glob("text*.gz"): print(file) X_train, y_train, X_test, y_test,X_val, y_val = load_svmlight_files((gzip.open(path+"train\\"+file), gzip.open(path+"test\\"+file),gzip.open(path+"validation\\"+file))) X_train = X_train[y_train!=31] X_test = X_test[y_test!=31] X_val = X_val[y_val!=31] y_train = y_train[y_train!=31] y_test = y_test[y_test!=31] y_val = y_val[y_val!=31] svmClf = Pipeline([ ('clf', SGDClassifier(loss='log', penalty='l1',alpha=1e-6, n_iter=10, random_state=88)),]) svmClf = svmClf.fit(X_train, y_train) predicted_train = svmClf.predict(X_train) train_acc = np.mean(predicted_train == y_train) print "Train Model Accuracy %f" % train_acc train_post = pd.DataFrame(svmClf.predict_proba(X_train)) predicted_test = svmClf.predict(X_test) test_acc = np.mean(predicted_test == y_test) print "Test Model Accuracy %f" % test_acc test_post = pd.DataFrame(svmClf.predict_proba(X_test)) predicted_val = svmClf.predict(X_val) val_acc = np.mean(predicted_val == y_val) print "Validation Model Accuracy %f" % val_acc val_post = pd.DataFrame(svmClf.predict_proba(X_val)) train_entropy = entropy(train_post) train_post_array.append(train_post) train_entropy_array.append(train_entropy) test_entropy = entropy(test_post) test_post_array.append(test_post) test_entropy_array.append(test_entropy) val_entropy = entropy(val_post) val_post_array.append(val_post) val_entropy_array.append(val_entropy) temp = pd.DataFrame([[file,train_acc,test_acc,val_acc]]) data_df = data_df.append(temp,ignore_index =True) return train_post_array,test_post_array,val_post_array,train_entropy_array,test_entropy_array,val_entropy_array,data_df
#splitter from sklearn.datasets import load_svmlight_files trn_X, trn_y, tst_X, tst_y = load_svmlight_files(("C:/Users/Ryan/git/nlp/trn.dat", "C:/Users/Ryan/git/nlp/tst.dat")) print trn_X.shape[1]
from sklearn.datasets import load_svmlight_files from sklearn import svm from sklearn.preprocessing import normalize, label_binarize from sklearn.cross_validation import * from sklearn.grid_search import GridSearchCV import pylab as pl # read training data and validation data merged using cat in satimage.scale.train X_train, Y_train, X_test, Y_test = load_svmlight_files(["satimage.scale.train","satimage.scale.t"]) # normalize & binarize X_train = normalize(X_train) Y_train = label_binarize(Y_train,classes=[1,2,3,4,5,6])[:,5] X_test = normalize(X_test) Y_test = label_binarize(Y_test,classes=[1,2,3,4,5,6])[:,5] # build the classifier def svm_score(c,d): clf = svm.SVC(C=c,kernel='poly',degree=d) kfold = KFold(len(Y_train), n_folds=5) scores = cross_val_score(clf,X_train,Y_train,cv=kfold,n_jobs=-1) return scores.mean() x = pl.linspace(1,20,20) y1=[] y2=[] y3=[] for i in x: y1.append(svm_score(i,1)) y2.append(svm_score(i,2))
def get_data(type): return load_svmlight_files( ("../data/Fold1/" + type, "../data/Fold2/" + type, "../data/Fold3/" + type, "../data/Fold4/" + type, "../data/Fold5/" + type))
def fiveFold(): # Feature groups # protocol_dependent = range(13) + range(66,69) # protocol_dependent = range(23) + range(66,69) # peak features # protocol_dependent = range(23,41) # All but peak # protocol_dependent = range(23) + range(41,69) fsslv_cipher_suites = [6, 7, 8, 9, 10, 11, 12] protocol_dependent = [] # Load data data_path = os.getcwd() + "/data_set/libSVM" train_0 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_0_train" test_0 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_0_test" train_1 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_1_train" test_1 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_1_test" train_2 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_2_train" test_2 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_2_test" train_3 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_3_train" test_3 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_3_test" train_4 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_4_train" test_4 = data_path + "/samples_25.2.16_comb_triple.csv_libSVM_4_test" X_train_0, y_train_0, X_test_0, y_test_0 = load_svmlight_files( (train_0, test_0)) X_train_1, y_train_1, X_test_1, y_test_1 = load_svmlight_files( (train_1, test_1)) X_train_2, y_train_2, X_test_2, y_test_2 = load_svmlight_files( (train_2, test_2)) X_train_3, y_train_3, X_test_3, y_test_3 = load_svmlight_files( (train_3, test_3)) X_train_4, y_train_4, X_test_4, y_test_4 = load_svmlight_files( (train_4, test_4)) df_train_0 = pd.DataFrame(X_train_0.toarray()) df_test_0 = pd.DataFrame(X_test_0.toarray()) df_train_1 = pd.DataFrame(X_train_1.toarray()) df_test_1 = pd.DataFrame(X_test_1.toarray()) df_train_2 = pd.DataFrame(X_train_2.toarray()) df_test_2 = pd.DataFrame(X_test_2.toarray()) df_train_3 = pd.DataFrame(X_train_3.toarray()) df_test_3 = pd.DataFrame(X_test_3.toarray()) df_train_4 = pd.DataFrame(X_train_4.toarray()) df_test_4 = pd.DataFrame(X_test_4.toarray()) X_train_0 = df_train_0.drop(protocol_dependent, axis=1) X_test_0 = df_test_0.drop(protocol_dependent, axis=1) X_train_1 = df_train_1.drop(protocol_dependent, axis=1) X_test_1 = df_test_1.drop(protocol_dependent, axis=1) X_train_2 = df_train_2.drop(protocol_dependent, axis=1) X_test_2 = df_test_2.drop(protocol_dependent, axis=1) X_train_3 = df_train_3.drop(protocol_dependent, axis=1) X_test_3 = df_test_3.drop(protocol_dependent, axis=1) X_train_4 = df_train_4.drop(protocol_dependent, axis=1) X_test_4 = df_test_4.drop(protocol_dependent, axis=1) # X_train_0 = randomProtocolValues(X_train_0) # X_test_0 = randomProtocolValues(X_test_0) # X_train_1 = randomProtocolValues(X_train_1) # X_test_1 = randomProtocolValues(X_test_1) # X_train_2 = randomProtocolValues(X_train_2) # X_test_2 = randomProtocolValues(X_test_2) # X_train_3 = randomProtocolValues(X_train_3) # X_test_3 = randomProtocolValues(X_test_3) # X_train_4 = randomProtocolValues(X_train_4) # X_test_4 = randomProtocolValues(X_test_4) # Prepare ensemble method estimators = [] model1 = KNeighborsClassifier(n_neighbors=16, algorithm='ball_tree', metric='canberra', n_jobs=-1) estimators.append(('knn', model1)) model2 = SVC(gamma=0.0078125, C=8192, probability=False) estimators.append(('svmrbf', model2)) model3 = DecisionTreeClassifier() #max_depth=50) estimators.append(('DecisionTree', model3)) model4 = RandomForestClassifier(n_estimators=100, oob_score=True, n_jobs=-1) estimators.append(('RandomForest', model4)) model5 = XGBClassifier(max_depth=10, n_estimators=100, learning_rate=0.1) estimators.append(('XGBoost', model5)) 
# ensemble = VotingClassifier(estimators,voting='hard') ensemble = CategoryClassifier() # CategoricalEnsembleVoting(X_train_0, y_train_0, X_test_0, y_test_0) oneFold(X_train_0, y_train_0, X_test_0, y_test_0, ensemble) oneFold(X_train_1, y_train_1, X_test_1, y_test_1, ensemble) oneFold(X_train_2, y_train_2, X_test_2, y_test_2, ensemble) oneFold(X_train_3, y_train_3, X_test_3, y_test_3, ensemble) oneFold(X_train_4, y_train_4, X_test_4, y_test_4, ensemble)
data_x, data_y = data[0][:1000, :5], data[1][:1000] data_y_binary = (data_y > 5).astype(np.int32) print("Binary classification") print("training model") model = xgboost.XGBClassifier(n_estimators=10) model.fit(data_x, data_y_binary) features = ["f{0}".format(i) for i in range(data_x.shape[1])] target_names = [ "cls{0}".format(i) for i in range(len(np.unique(data_y_binary))) ] bdt = BDTxgboost(model, features, target_names) bdt.to_tmva("test.xml") bdt.setup_tmva("test.xml") d1 = 0.0 for irow in range(data_x.shape[0]): predA1 = bdt.eval_tmva(data_x[irow, :]) predB1 = bdt.eval(data_x[irow, :]) d1 += np.abs((predA1 - predB1) / predA1) if __name__ == "__main__": print("fetching data") data = load_svmlight_files(("usps", "usps.t")) #simple_test_xgboost() unittest.main()
def pCoverX(featureFamily): os.chdir("C:\\Users\\Vaibhav\\Desktop\\dir_data\\dir_data\\train") path = "C:\\Users\\Vaibhav\\Desktop\\dir_data\\dir_data\\" data_df = pd.DataFrame() n_guass = 2 train_post_array = [] test_post_array = [] val_post_array = [] train_entropy_array = [] test_entropy_array = [] val_entropy_array = [] fileType = featureFamily + '*.gz' for file in glob.glob(fileType): print(file) X_train, y_train, X_test, y_test, X_val, y_val = load_svmlight_files( (gzip.open(path + "train\\" + file), gzip.open(path + "test\\" + file), gzip.open(path + "validation\\" + file))) #X_train, y_train, X_test, y_test, X_val, y_val = load_svmlight_files(("train\\vision_cuboids_histogram.txt", "test\\vision_cuboids_histogram.txt","validation\\vision_cuboids_histogram.txt")) X_train = X_train[y_train != 31] X_test = X_test[y_test != 31] X_val = X_val[y_val != 31] y_train = y_train[y_train != 31] y_test = y_test[y_test != 31] y_val = y_val[y_val != 31] #========================= Feature Selection using Variance Thresold ============================================================= X_train_new, X_test_new, X_val_new = featureSelection(X_train, X_test, X_val, y_train, log=True, tech='LinearSVC') #========================= Mixture of Guassian ============================================================ train_prob, test_prob, val_prob = pXoverC(X_train_new, y_train, X_test_new, y_test, X_val_new, y_val, n_guass) #========================= Calculating Prior, Posterior and Entropy ============================================================ prr = prior(y_train) train_post = posterior(train_prob, prr) train_entropy = entropy(train_post) train_post_array.append(train_post) train_entropy_array.append(train_entropy) test_post = posterior(test_prob, prr) test_entropy = entropy(test_post) test_post_array.append(test_post) test_entropy_array.append(test_entropy) val_post = posterior(val_prob, prr) val_entropy = entropy(val_post) val_post_array.append(val_post) val_entropy_array.append(val_entropy) train_acc, c_mat = checkAccuracy(train_post, y_train) test_acc, c_mat = checkAccuracy(test_post, y_test) val_acc, c_mat = checkAccuracy(val_post, y_val) temp = pd.DataFrame([[file, train_acc, test_acc, val_acc]]) data_df = data_df.append(temp, ignore_index=True) return train_post_array, test_post_array, val_post_array, train_entropy_array, test_entropy_array, val_entropy_array, data_df
def svm_skin(X_train, y_train, X_test, y_test): """Learn the skin data sets with SVM with Linear kernel. X_*: Samples. y_*: labels. """ print 'SVM w/ Linear kernel' clf = svm.LinearSVC() clf.fit(X_train, y_train) score = 100 * clf.score(X_test.toarray(), y_test) print 'SVM score: %.2f%%' % score return score if __name__ == '__main__': # `data_size` is an integer which controls how big the data set is. # Use none for to use the whole dataset. # split_libsvm_dataset(path='skin.txt', data_size=None) # Load train and test samples (X) + labels (y). X_train, y_train, X_test, y_test = load_svmlight_files( ('skin-train.libsvm', 'skin-test.libsvm')) svm_skin(X_train, y_train, X_test, y_test) # iterations, scores = adaboost_skin(X_train, y_train, X_test, y_test) # graph = plot_success_per_size(iterations, scores) # show()
for key in data: print('{"%s":"%s"}' % (key, data[key])) write_dict({'pca_file': 'pca_plot.png'}) import datetime print(datetime.datetime.now()) from numpy import genfromtxt print("feature_file:", feature_file) if '.csv' in feature_file: X = genfromtxt(feature_file, delimiter=',') elif '.libsvm' in feature_file: X, y = datasets.load_svmlight_files([feature_file]) X = X.toarray() # y = y.toarray() svc = sc.load_model(args.model_file) if svc == None: svc = svm.SVC(C=args.C, kernel=args.kernel, degree=args.degree, gamma=args.gamma, coef0=args.coef0, shrinking=args.shrinking, probability=args.probability, tol=args.tol, cache_size=args.cache_size,
=================================================== In this example we show how to handle LIBSVM file format. """ from sklearn.datasets import load_svmlight_files import sklearn.metrics import jubakit from jubakit.classifier import Classifier, Dataset, Config # Load LIBSVM files. # Note that these example files are not included in this repository. # You can fetch them from: https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.html#news20 print("Loading LIBSVM files...") (train_X, train_y, test_X, test_y) = load_svmlight_files(['news20', 'news20.t']) # Create a Train Dataset. print("Creating train dataset...") train_ds = Dataset.from_matrix(train_X, train_y) # Create a Test Dataset print("Creating test dataset...") test_ds = Dataset.from_matrix(test_X, test_y) # Create a Classifier Service classifier = Classifier.run(Config()) # Train the classifier. print("Training...") for (idx, _) in classifier.train(train_ds):
# import pdb
# pdb.set_trace()
# print "Features: ", len(data["data"][0])
# print "Instances: ", len(data["data"])
# print len(set(data["target"]))

# load_mlcomp needs a dataset name (and MLCOMP_DATASETS_HOME set up)
data = datasets.load_mlcomp("20news-18828")
print "Features: ", len(data["data"][0])
print "Instances: ", len(data["data"])
import pdb
pdb.set_trace()

# load_sample_image returns a single image array and needs an image name
china = datasets.load_sample_image("china.jpg")
print "Shape: ", china.shape

data = datasets.load_sample_images()
print "Images: ", len(data["images"])

# the svmlight loaders need file paths; the names below are placeholders
X, y = datasets.load_svmlight_file("train.svmlight")
print "Features: ", X.shape[1]
print "Instances: ", X.shape[0]
print len(set(y))

X_train, y_train, X_test, y_test = datasets.load_svmlight_files(["train.svmlight", "test.svmlight"])
print "Features: ", X_train.shape[1]
print "Instances: ", X_train.shape[0]
print len(set(y_train))