Example #1
def test_dump():
    Xs, y = load_svmlight_file(datafile)
    Xd = Xs.toarray()

    for X in (Xs, Xd):
        for zero_based in (True, False):
            for dtype in [np.float32, np.float64]:
                f = BytesIO()
                # we need to pass a comment to get the version info in;
                # LibSVM doesn't grok comments so they're not put in by
                # default anymore.
                dump_svmlight_file(X.astype(dtype), y, f, comment="test",
                                   zero_based=zero_based)
                f.seek(0)

                comment = f.readline()
                assert_in("scikit-learn %s" % sklearn.__version__, comment)
                comment = f.readline()
                assert_in(["one", "zero"][zero_based] + "-based", comment)

                X2, y2 = load_svmlight_file(f, dtype=dtype,
                                            zero_based=zero_based)
                assert_equal(X2.dtype, dtype)
                if dtype == np.float32:
                    assert_array_almost_equal(
                        # allow a rounding error at the last decimal place
                        Xd.astype(dtype), X2.toarray(), 4)
                else:
                    assert_array_almost_equal(
                        # allow a rounding error at the last decimal place
                        Xd.astype(dtype), X2.toarray(), 15)
                assert_array_equal(y, y2)
Example #2
 def train(self, examples, outDir, parameters, classifyExamples=None, dummy=False):
     outDir = os.path.abspath(outDir)
     
     examples = self.getExampleFile(examples, dummy=dummy)
     classifyExamples = self.getExampleFile(classifyExamples, dummy=dummy)
     
     # Return a new classifier instance for following the training process and using the model
     classifier = copy.copy(self)
     classifier.parameters = parameters
     classifier._filesToRelease = [examples, classifyExamples]
     
     if not os.path.exists(outDir):
         os.makedirs(outDir)
     
     trainFeatures, trainClasses = datasets.load_svmlight_file(examples)
     # Keep the devel variables defined even when no classification examples are given
     develFeatures, develClasses = None, None
     if classifyExamples is not None:
         develFeatures, develClasses = datasets.load_svmlight_file(classifyExamples, trainFeatures.shape[1])
     binarizer = preprocessing.LabelBinarizer()
     binarizer.fit(trainClasses)
     trainClasses = binarizer.transform(trainClasses)
     if classifyExamples is not None:
         develClasses = binarizer.transform(develClasses)
     
     print >> sys.stderr, "Training Keras model with parameters:", parameters
     parameters = Parameters.get(parameters, {"TEES.classifier":"KerasClassifier", "layers":5, "lr":0.001, "epochs":1, "batch_size":64, "patience":10})
     np.random.seed(10)
     classifier.kerasModel = classifier._defineModel(outDir, parameters, trainFeatures, trainClasses, develFeatures, develClasses)
     classifier._fitModel(outDir, parameters, trainFeatures, trainClasses, develFeatures, develClasses)
def gridSearch():
	
	X_train, y_train = load_svmlight_file(svmPath + "/" + trainFile)
	X_test, y_test = load_svmlight_file(svmPath + "/" + testFile, n_features=X_train.shape[1])

	
	tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}]#, {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

	#training
#	clf = svm.SVC(kernel='linear')
#	clf.fit(X_features, trainingLabels)	

	scores = ['precision', 'recall']

	for score in scores:
		print("# Tuning hyper-parameters for %s" % score)
		print()

		clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5, scoring=score)
		clf.fit(X_train, y_train)
		print("Best parameters set found on development set:")
		print()
		print(clf.best_estimator_)
		print()
		print("Grid scores on development set:")
		print()
		for params, mean_score, scores in clf.grid_scores_:
			print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params))
		print()
		print("Detailed classification report:")
		print()
		print("The model is trained on the full development set.")
		print("The scores are computed on the full evaluation set.")
		print()
Example #4
def scale_mnist8m():
    from sklearn.datasets import load_svmlight_file


    print "loading train",datetime.datetime.now()
    dd_train = load_svmlight_file(base_folder_mnist + "mnist8m_6_8_train.libsvm")
    print "loading test", datetime.datetime.now()
    dd_test = load_svmlight_file(base_folder_mnist + "mnist8m_6_8_test.libsvm")

    Xtrain = dd_train[0]
    Xtest = dd_test[0]
    Ytrain = dd_train[1]
    Ytest = dd_test[1]

    Xtrain = csr_matrix((Xtrain.data, Xtrain.indices, Xtrain.indptr), shape=(Xtrain.shape[0], 786))
    Xtest = csr_matrix((Xtest.data, Xtest.indices, Xtest.indptr), shape=(Xtest.shape[0], 786))
    from sklearn.externals import joblib


    print "densifying train",datetime.datetime.now()
    Xtrain = Xtrain.todense()
    print "densifying test",datetime.datetime.now()
    Xtest = Xtest.todense()

    print "dumping train",datetime.datetime.now()
    joblib.dump((np.asarray(Xtrain),Ytrain),base_folder_mnist + "mnist8m_6_8_train_reshaped")
    #joblib.load(base_folder + "mnist8m_6_8_train_touple_small")
    print "dumping test",datetime.datetime.now()
    joblib.dump((np.asarray(Xtest),Ytest),base_folder_mnist + "mnist8m_6_8_test_reshaped")
    print "finished",datetime.datetime.now()
Example #5
File: svm.py Project: lkprof/sema
def test():
    x_train,y_train=load_svmlight_file("D:/traindata/12trainset")
    x_train.todense()
    x_test,y_test=load_svmlight_file("D:/traindata/12testset")
    x_test.todense()
    print(x_train.shape)
    #classifier
    clf=SVC(kernel='rbf')
    ovrclf=OneVsRestClassifier(clf,-1)
    #parameter
    parameters=[{'estimator__C':[2**-5,2**-4,2**-3,2**-2,2**-1,1,2**1,2**2,2**3,2**4,2**5],
                 'estimator__kernel':['rbf'],
                 'estimator__gamma':[2**-5,2**-4,2**-3,2**-2,2**-1,1,2**1,2**2,2**3,2**4,2**5]},
                {'estimator__C':[2**-5,2**-4,2**-3,2**-2,2**-1,1,2**1,2**2,2**3,2**4,2**5],
                 'estimator__kernel':['linear']}]
    para={'estimator__C':[2**-5,2**-4],
                 'estimator__kernel':['rbf'],
                 'estimator__gamma':[2**-1,1]}
    #scoring
    sougou_score=make_scorer(score_func,greater_is_better=False)
    #cross_validation iterator
    sfk=c_v.StratifiedKFold(y_train,shuffle=True,n_folds=5,random_state=0)
    #grid search
    gsclf=g_s.GridSearchCV(ovrclf,param_grid=para,cv=sfk,scoring=sougou_score)
    gsclf.fit(x_train,y_train)
    print("best score: ",gsclf.best_score_)
    print("best parameters: ",gsclf.best_params_)
    y_pred=gsclf.predict(x_test)

    #result
    target_names=['0','1','2','3']
    sum_y = np.sum((np.array(y_pred)-np.array(y_test))**2)
    print(classification_report(y_test,y_pred,target_names=target_names))
    print("sougouVal: ",float(sum_y)/y_pred.shape[0])
    print(time.time()-start_time)
def test_load_with_offsets(sparsity, n_samples, n_features):
    rng = np.random.RandomState(0)
    X = rng.uniform(low=0.0, high=1.0, size=(n_samples, n_features))
    if sparsity:
        X[X < sparsity] = 0.0
    X = sp.csr_matrix(X)
    y = rng.randint(low=0, high=2, size=n_samples)

    f = BytesIO()
    dump_svmlight_file(X, y, f)
    f.seek(0)

    size = len(f.getvalue())

    # put some marks that are likely to happen anywhere in a row
    mark_0 = 0
    mark_1 = size // 3
    length_0 = mark_1 - mark_0
    mark_2 = 4 * size // 5
    length_1 = mark_2 - mark_1

    # load the original sparse matrix into 3 independent CSR matrices
    X_0, y_0 = load_svmlight_file(f, n_features=n_features,
                                  offset=mark_0, length=length_0)
    X_1, y_1 = load_svmlight_file(f, n_features=n_features,
                                  offset=mark_1, length=length_1)
    X_2, y_2 = load_svmlight_file(f, n_features=n_features,
                                  offset=mark_2)

    y_concat = np.concatenate([y_0, y_1, y_2])
    X_concat = sp.vstack([X_0, X_1, X_2])
    assert_array_almost_equal(y, y_concat)
    assert_array_almost_equal(X.toarray(), X_concat.toarray())
def test_dump():
    Xs, y = load_svmlight_file(datafile)
    Xd = Xs.toarray()

    for X in (Xs, Xd):
        for zero_based in (True, False):
            for dtype in [np.float32, np.float64]:
                f = BytesIO()
                dump_svmlight_file(X.astype(dtype), y, f, comment="test",
                                   zero_based=zero_based)
                f.seek(0)

                comment = f.readline()
                assert_in("scikit-learn %s" % sklearn.__version__, comment)
                comment = f.readline()
                assert_in(["one", "zero"][zero_based] + "-based", comment)

                X2, y2 = load_svmlight_file(f, dtype=dtype, zero_based=zero_based)
                assert_equal(X2.dtype, dtype)
                if dtype == np.float32:
                    assert_array_almost_equal(
                        # allow a rounding error at the last decimal place
                        Xd.astype(dtype),
                        X2.toarray(),
                        4,
                    )
                else:
                    assert_array_almost_equal(
                        # allow a rounding error at the last decimal place
                        Xd.astype(dtype),
                        X2.toarray(),
                        15,
                    )
                assert_array_equal(y, y2)
def test_load_with_long_qid():
    # load svmfile with longint qid attribute
    data = b("""
    1 qid:0 0:1 1:2 2:3
    0 qid:72048431380967004 0:1440446648 1:72048431380967004 2:236784985
    0 qid:-9223372036854775807 0:1440446648 1:72048431380967004 2:236784985
    3 qid:9223372036854775807  0:1440446648 1:72048431380967004 2:236784985""")
    X, y, qid = load_svmlight_file(BytesIO(data), query_id=True)

    true_X = [[1,          2,                 3],
             [1440446648, 72048431380967004, 236784985],
             [1440446648, 72048431380967004, 236784985],
             [1440446648, 72048431380967004, 236784985]]

    true_y = [1, 0, 0, 3]
    trueQID = [0, 72048431380967004, -9223372036854775807, 9223372036854775807]
    assert_array_equal(y, true_y)
    assert_array_equal(X.toarray(), true_X)
    assert_array_equal(qid, trueQID)

    f = BytesIO()
    dump_svmlight_file(X, y, f, query_id=qid, zero_based=True)
    f.seek(0)
    X, y, qid = load_svmlight_file(f, query_id=True, zero_based=True)
    assert_array_equal(y, true_y)
    assert_array_equal(X.toarray(), true_X)
    assert_array_equal(qid, trueQID)

    f.seek(0)
    X, y = load_svmlight_file(f, query_id=False, zero_based=True)
    assert_array_equal(y, true_y)
    assert_array_equal(X.toarray(), true_X)
def test_dump_comment():
    X, y = load_svmlight_file(datafile)
    X = X.toarray()

    f = BytesIO()
    ascii_comment = "This is a comment\nspanning multiple lines."
    dump_svmlight_file(X, y, f, comment=ascii_comment, zero_based=False)
    f.seek(0)

    X2, y2 = load_svmlight_file(f, zero_based=False)
    assert_array_almost_equal(X, X2.toarray())
    assert_array_equal(y, y2)

    # XXX we have to update this to support Python 3.x
    utf8_comment = b"It is true that\n\xc2\xbd\xc2\xb2 = \xc2\xbc"
    f = BytesIO()
    assert_raises(UnicodeDecodeError, dump_svmlight_file, X, y, f, comment=utf8_comment)

    unicode_comment = utf8_comment.decode("utf-8")
    f = BytesIO()
    dump_svmlight_file(X, y, f, comment=unicode_comment, zero_based=False)
    f.seek(0)

    X2, y2 = load_svmlight_file(f, zero_based=False)
    assert_array_almost_equal(X, X2.toarray())
    assert_array_equal(y, y2)

    f = BytesIO()
    assert_raises(ValueError, dump_svmlight_file, X, y, f, comment="I've got a \0.")
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  C, n_fold=5):

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG, filename='lr_{}.log'.format(C))

    logging.info('Loading training and test data...')
    X, y = load_svmlight_file(train_file)
    X_tst, _ = load_svmlight_file(test_file)

    clf = LR(penalty='l2', dual=True, C=C, class_weight='auto',
             random_state=2015)

    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    logging.info('Cross validation...')
    p_val = np.zeros_like(y)
    lloss = 0.
    for i_trn, i_val in cv:
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        lloss += log_loss(y[i_val], p_val[i_val])

    logging.info('Log Loss = {:.4f}'.format(lloss))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
Example #11
def classification_subfeature(train, test, outclss):
    fields = iot.read_fields()
    print len(fields)
    foi = ['liwc_anal.result.i',
           'liwc_anal.result.we',
           'liwc_anal.result.affect',
           'liwc_anal.result.posemo',
           'liwc_anal.result.negemo',
           'liwc_anal.result.bio',
           'liwc_anal.result.body',
           'liwc_anal.result.health',
           'liwc_anal.result.ingest']
    indeces = [np.where(fields==f)[0][0] for f in foi]
    print fields[indeces]

    '''Load Training data'''
    X_train, y_train = load_svmlight_file(train)
    X_train = X_train.toarray()[:, indeces]
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    print X_train.shape
    '''Load Test data'''
    X_test, y_test = load_svmlight_file(test)
    X_test = X_test.toarray()[:, indeces]
    X_test = scaler.transform(X_test)
    print X_test.shape

    svc_lin = SVC(kernel='linear', class_weight='balanced')
    y_lin = svc_lin.fit(X_train, y_train).predict(X_test)
    # pickle.dump(y_test, open(outid, 'w'))
    pickle.dump(y_lin, open(outclss, 'w'))
Example #12
def train_and_test(domain_dir, sentences):
    train_dir = os.path.join(domain_dir, "train")
    test_dir = os.path.join(domain_dir, "test")
    X_train, y_train = load_svmlight_file(os.path.join(train_dir, "feature_vector"))
    X_test, y_test = load_svmlight_file(os.path.join(test_dir, "feature_vector"))
    clf = LogisticRegression(C=1.0, intercept_scaling=1, dual=False,
                             fit_intercept=True, penalty="l2", tol=0.0001)
    print("fit..")
    clf.fit(X_train, y_train)
    print("fit end...")
    y_train_predict = clf.predict(X_train)
    print(f1_score(y_train, y_train_predict))
    y = clf.predict(X_test)
    f = open(os.path.join(test_dir, "relation.classifier"), "w", encoding="utf8")
    i = 0
    for sentence in sentences:
        flag = False
        str_list = []
        str_list.append("S\t{0}".format(sentence.text))
        for pair in sentence.candidate_relation:
            if y[i] != 0:
                flag = True
                str_list.append("R\t{0}\t{1}\t{2}\t{3}".format(
                    sentence.print_phrase(pair[0]).lower(),
                    sentence.print_phrase(pair[1]).lower(),
                    list(pair[0]),
                    list(pair[1])))
            i += 1
        if flag:
            for s in str_list:
                print(s, file=f)
    f.close()
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_fold=5):

    feature_name = os.path.basename(train_file)[:-10]
    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='esb_xg_grid_colsub_{}.log'.format(feature_name))

    logging.info('Loading training and test data...')
    X, y = load_svmlight_file(train_file)
    X_tst, _ = load_svmlight_file(test_file)

    xg = xgb.XGBClassifier()
    param = {'learning_rate': [.01, .03, .05], 'max_depth': [4, 5, 6],
             'n_estimators': [400, 600]}
    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)
    clf = GridSearchCV(xg, param, scoring='log_loss', verbose=1, cv=cv)

    logging.info('Cross validation for grid search...')
    clf.fit(X, y)
    p = clf.predict_proba(X)[:, 1]

    logging.info('best model = {}'.format(clf.best_estimator_))
    logging.info('best score = {:.4f}'.format(clf.best_score_))

    logging.info('Retraining with 100% data...')
    clf.best_estimator_.fit(X, y)
    p_tst = clf.best_estimator_.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
Example #14
 def check_data_compatibility(self):
     try:
         load_svmlight_file(self.input_path)
         return True
     except Exception as ex:
         print(ex)
         return False
Example #15
	def load(self, dataset = None, data_dir = "/home/drunkeneye/lab/data", verbose = None):
		if verbose is None:
			verbose = self.verbose
			
		if dataset is None:
			dataset = self.name
		# first try to load the data 'directly'
		try:
			filePath = os.path.join(data_dir, dataset, dataset)
			if verbose:
				print("  Trying to load data set from {}". format(filePath))
			self.X, self.y = load_svmlight_file(filePath)
			self.X = np.asarray(self.X.todense())
			if verbose:
				print ("    Loaded from {}". format( filePath))
			return
		except:
			pass
		
		# next try
		try:
			filePath = os.path.join(data_dir, dataset, dataset + ".combined.scaled")
			if verbose:
				print("  Trying to load data set from {}". format(filePath))
			self.X, self.y = load_svmlight_file(filePath)
			self.X = np.asarray(self.X.todense())
			if verbose:
				print ("    Loaded from {}". format( filePath))
			return 
		except:
			pass
Example #16
def run(train_fp, test_fp, pred_fp, key_fp):

	keys = []
	load(key_fp, keys)

	X_train, y_train = load_svmlight_file(train_fp)
	X_test, y_test = load_svmlight_file(test_fp)

	#dtrain = xgb.DMatrix(train_fp)
	#dtest = xgb.DMatrix(test_fp)

	params = {}
	with open("lr_reg.params", 'r') as f:
		params = json.load(f)
	print "[%s] [INFO] params: %s\n" % (t_now(), str(params))

	model = linear_model.Ridge(alpha=params['alpha'])
	model.fit(X_train, y_train)
	pred = model.predict(X_test)
	#model = xgb.train( params, dtrain, params['n_round'])
	#model = xgb.train( params, dtrain, params['n_round'], obj = customed_obj_1)
	#pred = model.predict(dtest, ntree_limit=params['n_round'])
	#pred = model.predict(dtest)

	f = open(pred_fp, 'w')
	for i in range(len(keys)):
		f.write(keys[i] + "," + str(max(1.0, pred[i])) + "\n")
	f.close()

	return 0
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est, depth, n_fold=5):

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG, filename='rf_{}_{}.log'.format(
                                                        n_est, depth
                                                       ))

    logging.info('Loading training and test data...')
    X, y = load_svmlight_file(train_file)
    X_tst, _ = load_svmlight_file(test_file)

    clf = RF(n_estimators=n_est, max_depth=depth, random_state=2015)

    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    logging.info('Cross validation...')
    p_val = np.zeros_like(y)
    lloss = 0.
    for i_trn, i_val in cv:
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        lloss += log_loss(y[i_val], p_val[i_val])

    logging.info('Log Loss = {:.4f}'.format(lloss))

    logging.info('Retraining with 100% data...')
    clf.fit(X.todense(), y)
    p_tst = clf.predict_proba(X_tst.todense())[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
Example #18
def main():

    # svm_para = {'C': 10.0, 'kernel': 'rbf', 'gamma': 1.667, 'verbose': False}
    # svm_para = {'kernel': 'linear', 'verbose': False}
    # loading data
    # X_train, y_train = datasets.load_svmlight_file(r'./dataset/mnist_train_784_poly_8vr.dat')
    # X_train, y_train = datasets.load_svmlight_file(r'./dataset/covtype_tr_2vr.data')

    # svm_para = {'C': 10.0, 'kernel': 'rbf', 'gamma': 0.00002, 'tol': 0.01, 'verbose': False}

    # census
    svm_para = {"C": 10.0, "kernel": "rbf", "gamma": 1.667, "verbose": False}
    X_train, y_train = datasets.load_svmlight_file(r"./dataset/census.train")

    # test ramdom sampling
    RS_SVM = RandomSamplingSVM(svm_para)
    start_time = time.time()
    model = RS_SVM.train_one_half_v2(X_train, y_train)

    if model is None:
        print("Can not train the dataset", flush=True)
    else:
        print("Remain SVs: " + str(model.n_support_), flush=True)
        print("--- %s seconds ---" % (time.time() - start_time), flush=True)

        # X_test, y_test = datasets.load_svmlight_file(r'./dataset/mnist_test_784_poly_8vr.dat')
        # X_test, y_test = datasets.load_svmlight_file(r'./dataset/covtype_tst_2vr.data')
        X_test, y_test = datasets.load_svmlight_file(r"./dataset/census.train")
        ratio = model.score(X_test, y_test)
        print(ratio)
        print("--- %s seconds ---" % (time.time() - start_time), flush=True)
Example #19
def load_data(dataset1, dataset2=None, make_dense=False):
    """Loads the dataset(s) given in the the svmlight / libsvm format

    **Parameters**

    * dataset1 (*str*) - Path to the file of the first dataset.
    * dataset2 (*str or None*) - If not None, path to the file of second dataset
    * make_dense (*boolean*) - Whether to return dense matrices instead of sparse ones

    **Returns**

    * (X_pool, X_test, y_pool, y_test) - Pool and test sets if two files are provided
    * (X, y) - The single dataset otherwise

    """
    if dataset2:
        X_pool, y_pool = load_svmlight_file(dataset1)
        _, num_feat = X_pool.shape
        X_test, y_test = load_svmlight_file(dataset2, n_features=num_feat)
        if make_dense:
            X_pool = X_pool.todense()
            X_test = X_test.todense()
        return (X_pool, X_test, y_pool, y_test) 

    else:
        X, y = load_svmlight_file(dataset1)
        if make_dense:
            X = X.todense()
        return X, y
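For reference, a minimal usage sketch of this helper (the paths below are made-up placeholders, not files referenced anywhere in this example):

# Two datasets: returns pool/test splits with a shared feature space.
X_pool, X_test, y_pool, y_test = load_data("data/train.svmlight",
                                           "data/test.svmlight",
                                           make_dense=True)
# Single dataset: returns just (X, y).
X, y = load_data("data/train.svmlight")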
Example #20
File: svm.py Project: lkprof/sema
def svm():
    #load data
    x_train,y_train=load_svmlight_file("12trainset")
    x_train.todense()
    x_test,y_test=load_svmlight_file("12testdata")
    x_test.todense()
    sk=SelectKBest(f_classif,9).fit(x_train,y_train)
    x_new=sk.transform(x_train)
    x_newtest=sk.transform(x_test)
    print(sk.scores_)
    print(x_new.shape)
    print(sk.get_support())
    #classifier
    clf=SVC(C=2,gamma=2)
    ovrclf=OneVsRestClassifier(clf,-1)
    ovrclf.fit(x_train,y_train)
    y_pred=ovrclf.predict(x_test)
    # write result
    with open("result.txt","w") as fw:
        for st in y_pred.tolist():
            fw.write(str(st)+'\n')
    print(np.array(y_pred).shape)

    target_names=['0','1','2','3']
    #result
    #sum_y = np.sum((np.array(y_pred)-np.array(y_test))**2)
    #print(classification_report(y_test,y_pred,target_names=target_names))
    #print("sougouVal: ",float(sum_y)/y_pred.shape[0])
    print(time.time()-start_time)
def test_load_compressed():
    X, y = load_svmlight_file(datafile)

    with NamedTemporaryFile(prefix="sklearn-test", suffix=".gz") as tmp:
        tmp.close()  # necessary under windows
        with open(datafile, "rb") as f:
            with gzip.open(tmp.name, "wb") as fh_out:
                shutil.copyfileobj(f, fh_out)
        Xgz, ygz = load_svmlight_file(tmp.name)
        # because we "close" it manually and write to it,
        # we need to remove it manually.
        os.remove(tmp.name)
    assert_array_almost_equal(X.toarray(), Xgz.toarray())
    assert_array_almost_equal(y, ygz)

    with NamedTemporaryFile(prefix="sklearn-test", suffix=".bz2") as tmp:
        tmp.close()  # necessary under windows
        with open(datafile, "rb") as f:
            with BZ2File(tmp.name, "wb") as fh_out:
                shutil.copyfileobj(f, fh_out)
        Xbz, ybz = load_svmlight_file(tmp.name)
        # because we "close" it manually and write to it,
        # we need to remove it manually.
        os.remove(tmp.name)
    assert_array_almost_equal(X.toarray(), Xbz.toarray())
    assert_array_almost_equal(y, ybz)
Example #22
def train_predict_lr_cv(train_file, test_file, predict_train_file,
                        predict_test_file, c, n_fold=10):
    logger.info("Reading in the training data")
    X_trn, y_trn = load_svmlight_file(train_file)
    X_trn = X_trn.todense()

    logger.info("Reading in the test data")
    X_tst, _ = load_svmlight_file(test_file)
    X_tst = X_tst.todense()
    
    logger.info('Normalizing data')
    scaler = StandardScaler()
    X_trn = scaler.fit_transform(X_trn)
    X_tst = scaler.transform(X_tst)
 
    cv = cross_validation.StratifiedKFold(y_trn, n_folds=n_fold, shuffle=True,
                                          random_state=1)

    yhat_tst = np.zeros((X_tst.shape[0], ))
    yhat_trn = np.zeros((X_trn.shape[0], ))
    for i, (i_trn, i_val) in enumerate(cv, start=1):
        logger.info('Training CV #{}'.format(i))
        clf = LogisticRegression(C=c, class_weight=None, random_state=2013)
        clf.fit(X_trn[i_trn], y_trn[i_trn])

        yhat_trn[i_val] = clf.predict_proba(X_trn[i_val])[:, 1]
        yhat_tst += np.array(clf.predict_proba(X_tst)[:, 1]) / n_fold

    auc_cv = metrics.roc_auc_score(y_trn, yhat_trn)
    logger.info('AUC CV: {}'.format(auc_cv))
    logger.info("Writing test predictions to file")
    np.savetxt(predict_train_file, yhat_trn, fmt='%.6f', delimiter=',')
    np.savetxt(predict_test_file, yhat_tst, fmt='%.6f', delimiter=',')
Example #23
def loadData():
    data1, target = load_svmlight_file('dataset/text.scale')
    data2, target = load_svmlight_file('dataset/following.scale')

    data1, data2, target = shuffle(data1, data2, target)

    return (data1, data2, target)
Example #24
def load_covtype():
    try:
        x, y = da.load_svmlight_file("data/covtype/covtype.sample04_train", 54)
        x_test, y_test = da.load_svmlight_file("data/covtype/covtype.sample04_test", 54)
    except Exception:
        x, y = da.load_svmlight_file("../data/covtype/covtype.sample04_train", 54)
        x_test, y_test = da.load_svmlight_file("../data/covtype/covtype.sample04_test", 54)
    return x, x_test, y, y_test
Example #25
def drop_fn(train_file,drop_file):
    x_train,y_train=load_svmlight_file(train_file)
    x_fn,y_fn=load_svmlight_file(drop_file,n_features=x_train.shape[1])
    iterations = 0
    while 1:
        print 'iteration:%d'%iterations
        iterations += 1
        train_set=update_model((x_train,y_train),(x_fn,y_fn))
Example #26
def nn_classify():
    # train_X,Y = load_svmlight_file('data/train_metrix')
    # rows = pd.read_csv('data/log_test2.csv',index_col=0).sort_index().index.unique()
    # train_X = pd.read_csv('data/train_tfidf.csv',index_col=0)
    # test_X = pd.read_csv('data/test_tfidf.csv',index_col=0)
    # select = SelectPercentile(f_classif, percentile=50)
    # select.fit(train_X,Y)
    # train_X = select.transform(train_X)
    # test_X = select.transform(test_X)
    # print 'dump train...'
    # dump_svmlight_file(train_X,Y,'data/train_last')
    # test_Y = [0]*(test_X.shape[0])
    # print 'dump test...'
    # dump_svmlight_file(test_X,test_Y,'data/test_last')

    train_X,Y = load_svmlight_file('data/train_last')
    test_X,test_Y = load_svmlight_file('data/test_last')
    train_X = train_X.toarray()
    test_X = test_X.toarray()
    Y = [int(y)-1 for y in Y]
    print 'Y:',len(Y)
    rows = pd.read_csv('data/log_test2.csv',index_col=0).sort_index().index.unique()
    train_n = train_X.shape[0]
    m = train_X.shape[1]
    test_n = test_X.shape[0]
    print train_n,m,#test_n
    train_data = ClassificationDataSet(m,1,nb_classes=12)
    test_data = ClassificationDataSet(m,1,nb_classes=12)
    # test_data = ClassificationDataSet(test_n,m,nb_classes=12)
    for i in range(train_n):
        train_data.addSample(np.ravel(train_X[i]),Y[i])
    for i in range(test_n):
        test_data.addSample(test_X[i],Y[i])
    trndata = train_data
    # tstdata = train_data

    trndata._convertToOneOfMany()
    # tstdata._convertToOneOfMany()
    test_data._convertToOneOfMany()

    # First, train all the classifiers on the training set
    print 'train classify...'
    fnn = buildNetwork( trndata.indim, 400 , trndata.outdim, outclass=SoftmaxLayer )
    trainer = BackpropTrainer( fnn, dataset=trndata, momentum=0.1, learningrate=0.01 , verbose=True, weightdecay=0.01)
    trainer.trainEpochs(3)
    # print 'Percent Error on Test dataset: ' , percentError( trainer.testOnClassData (
    #            dataset=tstdata )
    #            , )
    print 'end train classify'
    pre_y = trainer.testOnClassData(dataset=trndata)
    print metrics.classification_report(Y,pre_y)
    pre_y = trainer.testOnClassData(dataset=test_data)
    print 'write result...'
    print 'before:',pre_y[:100]
    pre_y = [int(y)+1 for y in pre_y]
    print 'after:',pre_y[:100]
    DataFrame(pre_y,index=rows).to_csv('data/info_test2.csv', header=False)
    print 'end...'
Example #27
def load_train_test():

    current_path = os.path.abspath(
            os.path.join(os.getcwd(), os.pardir))

    train, train_labels = load_svmlight_file(current_path + "/Data/Processed/TrainSet.svm")
    test, test_labels = load_svmlight_file(current_path + "/Data/Processed/TestSet.svm")

    return train, test, train_labels, test_labels
def test_dump():
    X_sparse, y_dense = load_svmlight_file(datafile)
    X_dense = X_sparse.toarray()
    y_sparse = sp.csr_matrix(y_dense)

    # slicing a csr_matrix can unsort its .indices, so test that we sort
    # those correctly
    X_sliced = X_sparse[np.arange(X_sparse.shape[0])]
    y_sliced = y_sparse[np.arange(y_sparse.shape[0])]

    for X in (X_sparse, X_dense, X_sliced):
        for y in (y_sparse, y_dense, y_sliced):
            for zero_based in (True, False):
                for dtype in [np.float32, np.float64, np.int32]:
                    f = BytesIO()
                    # we need to pass a comment to get the version info in;
                    # LibSVM doesn't grok comments so they're not put in by
                    # default anymore.

                    if (sp.issparse(y) and y.shape[0] == 1):
                        # make sure y's shape is: (n_samples, n_labels)
                        # when it is sparse
                        y = y.T

                    dump_svmlight_file(X.astype(dtype), y, f, comment="test",
                                       zero_based=zero_based)
                    f.seek(0)

                    comment = f.readline()
                    comment = str(comment, "utf-8")

                    assert_in("scikit-learn %s" % sklearn.__version__, comment)

                    comment = f.readline()
                    comment = str(comment, "utf-8")

                    assert_in(["one", "zero"][zero_based] + "-based", comment)

                    X2, y2 = load_svmlight_file(f, dtype=dtype,
                                                zero_based=zero_based)
                    assert_equal(X2.dtype, dtype)
                    assert_array_equal(X2.sorted_indices().indices, X2.indices)

                    X2_dense = X2.toarray()

                    if dtype == np.float32:
                        # allow a rounding error at the last decimal place
                        assert_array_almost_equal(
                            X_dense.astype(dtype), X2_dense, 4)
                        assert_array_almost_equal(
                            y_dense.astype(dtype), y2, 4)
                    else:
                        # allow a rounding error at the last decimal place
                        assert_array_almost_equal(
                            X_dense.astype(dtype), X2_dense, 15)
                        assert_array_almost_equal(
                            y_dense.astype(dtype), y2, 15)
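As a side note on the sorted-indices comment above, a small self-contained sketch (independent of the test fixtures) of slicing a CSR matrix and restoring canonical index order with sorted_indices():

import numpy as np
import scipy.sparse as sp

A = sp.csr_matrix(np.array([[1.0, 0.0, 2.0],
                            [0.0, 3.0, 4.0]]))
A_sliced = A[np.arange(A.shape[0])]  # fancy row indexing, as in the test above
# Depending on the SciPy version, fancy indexing may leave .indices unsorted
# within rows; sorted_indices() always returns a canonically ordered copy.
A_sorted = A_sliced.sorted_indices()
print(A_sorted.has_sorted_indices)                      # True
print(np.array_equal(A_sorted.toarray(), A.toarray()))  # True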
Example #29
 def test_lambdarank(self):
     X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train'))
     X_test, y_test = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.test'))
     q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train.query'))
     q_test = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.test.query'))
     gbm = lgb.LGBMRanker()
     gbm.fit(X_train, y_train, group=q_train, eval_set=[(X_test, y_test)],
             eval_group=[q_test], eval_at=[1, 3], early_stopping_rounds=5, verbose=False,
             callbacks=[lgb.reset_parameter(learning_rate=lambda x: 0.95 ** x * 0.1)])
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_iter=100, hidden=4, lrate=.1, n_fold=5):

    _, y_val = load_svmlight_file(train_file)

    cv = StratifiedKFold(y_val, n_folds=n_fold, shuffle=True, random_state=2015)

    logging.info('Cross validation...')
    p_val = np.zeros_like(y_val)
    lloss = 0.
    for i_trn, i_val in cv:
        clf = NN(n=10000, h=hidden, a=lrate, seed=2015)

        logging.info('Epoch\tTrain\tValid')
        logging.info('=========================')
        for i_iter in range(n_iter):
            lloss_trn = 0.
            cnt_trn = 0
            for i, (x, y) in enumerate(clf.read_sparse(train_file)):
                if i in i_val:
                    p_val[i] = clf.predict(x)
                else:
                    p = clf.predict(x)
                    clf.update(x, p - y)
                    lloss_trn += logloss(y, p)
                    cnt_trn += 1

            lloss_trn /= cnt_trn
            lloss_val = log_loss(y_val[i_val], p_val[i_val])

            if (i_iter == 0) or ((i_iter + 1) % int(n_iter / 10) == 0) or (i_iter == n_iter - 1):
                logging.info('#{:4d}\t{:.4f}\t{:.4f}'.format(i_iter + 1,
                                                             lloss_trn,
                                                             lloss_val))

        lloss += lloss_val

    logging.info('Log Loss = {:.4f}'.format(lloss / n_fold))

    logging.info('Retraining with 100% data...')
    clf = NN(n=10000, h=hidden, a=lrate, seed=2015)
    for i_iter in range(n_iter):
        for x, y in clf.read_sparse(train_file):
            p = clf.predict(x)
            clf.update(x, p - y)

        logging.info('#{:4d}'.format(i_iter + 1))

    _, y_tst = load_svmlight_file(test_file)
    p_tst = np.zeros_like(y_tst)
    for i, (x, _) in enumerate(clf.read_sparse(test_file)):
        p_tst[i] = clf.predict(x)

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
Example #31
def test_load_svmlight_file_multilabel():
    X, y = load_svmlight_file(multifile, multilabel=True)
    assert_equal(y, [(0, 1), (2,), (1, 2)])
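The multilabel variant of the format puts a comma-separated label list in front of the features; a minimal round-trip sketch using an in-memory file rather than the multifile fixture:

from io import BytesIO
from sklearn.datasets import load_svmlight_file

# each line: comma-separated labels, then index:value pairs
data = BytesIO(b"0,1 2:2.5 10:-5.2\n2 5:1.0\n1,2 20:27\n")
X_ml, y_ml = load_svmlight_file(data, multilabel=True)
print(y_ml)  # [(0.0, 1.0), (2.0,), (1.0, 2.0)]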
    return train_face, train_face_number, test_face, test_face_number  # tuple


def resizeSVHDShape(matrix):
    svhd = np.zeros((5000, 3072))
    [rows, cols] = svhd.shape
    for r in range(rows):
        for c in range(cols):
            svhd[r][c] = matrix[int(
                (c % 1024) / 32)][(c % 1024) % 32][int(c / 1024)][r]
    return svhd


if __name__ == "__main__":

    x_train2, y_train2 = ds.load_svmlight_file(
        'F:/projects/vec2vec/data-clear-xlren/data-clear/movie/train.bow')

    x_train = x_train2.toarray()
    y_train = y_train2

    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    x_train = x_train[0:2000, :]
    y_train = y_train[0:2000]

    models = []
    emb_size = 64
    num_neighbors = 16
    print(x_train.shape)
Example #33
def sensorless(path_sensorless):
	from sklearn.datasets import load_svmlight_file
	data = load_svmlight_file(path_sensorless+"/Sensorless.scale")
	dense_vector = np.zeros((data[0].shape[0],data[0].shape[1]))
	data[0].toarray(out = dense_vector)
	return dense_vector, data[1]
Example #34
# kafkaproducer.start()

# import os
# import shutil
# path = r'/home/sunbite/video/action_youtube_naudio'
# for dirpath,dirnames,filenames in os.walk(path):
#     for filename in filenames:
#         print(os.path.join(dirpath, filename))
# my_feature = GetFeatures.get_features('v_shooting_01_01_0.jpg')
# print(my_feature)
# ssh = paramiko.SSHClient()
# ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
# ssh.connect(hostname='10.3.11.131', username='******', password='******')
# stdin, stdout, stderr = ssh.exec_command('who')
# print()
# print(stdout.read())
# vd = VideoDetector.VideoDetector("/home/sunbite/video/action_youtube_naudio/basketball/v_shooting_01_01.avi","/home/sunbite/Co_KNN_SVM_TMP/CoKNNSVM.model")
# print(vd.getLabel())
x, y = datasets.load_svmlight_file("/home/sunbite/dataset/dataset")
print(x.todense())
print(y)

train_x, test_x, train_y, test_y = train_test_split(x.todense(),
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    shuffle=False)
print(len(train_x))
print(len(test_x))
print(len(train_y))
print(len(test_y))
X = ssp.hstack([
    # X,
    X_char_1,
    X_char_2,
    X_char_3,
    X_char_1_q,
    X_char_2_q,
    X_char_3_q,
    # X_char_4_5_6_q,
    # sim_char_2,
    # sim_char_3,
]).tocsr()

dump_svmlight_file(X, y, path + 'data.svm')

data, y_all = load_svmlight_file(path + 'data.svm')
y_all = y
data = X
del X
del y

X = data[:len_train]
y = y_all[:len_train]
X_t = data[len_train:]
del data
del y_all


def make_mf_lr(X, y, clf, X_test, n_round=3):
    n = X.shape[0]
    '''
Example #36
def test_load_invalid_file():
    load_svmlight_file(invalidfile)
Example #37
def get_data():
    data = load_svmlight_file("housing_scale")
    return data[0], data[1]
Example #38
def test_invalid_filename():
    load_svmlight_file("trou pic nic douille")
from sklearn.datasets import load_svmlight_file
feature_vectors, targets = load_svmlight_file("training_data_file.IDF")
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB , BernoulliNB
from sklearn.neighbors  import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.svm  import SVC
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, mutual_info_classif


classifiers = ['MultinomialNB', 'BernoulliNB', 'KNeighborsClassifier', 'SVC']

for n,clf in enumerate([ MultinomialNB(), BernoulliNB(), KNeighborsClassifier(),SVC()]):

    chi2_scores = []
    mutual_info_scores = []
    for k in range(100, 5000, 100):

        X_new1 = SelectKBest(chi2, k=k).fit_transform(feature_vectors, targets)
        X_new2 = SelectKBest(mutual_info_classif, k=k).fit_transform(feature_vectors, targets)
        print(classifiers[n] + ' Chi Squared')
        scores = cross_val_score(clf, X_new1, targets, cv=5, scoring='f1_macro', verbose =0)
        chi2_scores.append(scores.mean())
        print("F1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


        print(classifiers[n] + ' Mutual Information')
        scores = cross_val_score(clf, X_new2, targets, cv=5, scoring='f1_macro', verbose =0)
        mutual_info_scores.append(scores.mean())
        print("F1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
Example #40
from __future__ import absolute_import

import numpy as np

from sklearn.datasets import load_svmlight_file

from rrf import RRF

DATA_DIR = "example_data"
N_RUNS = 10

if __name__ == '__main__':

    print("=========== Test classification on cod-rna data ===========")
    # X in [-1, 1]
    X, y = load_svmlight_file(DATA_DIR + "/cod-rna.scale", n_features=8)
    X = X.toarray()

    score = np.zeros(N_RUNS)
    train_time = np.zeros(N_RUNS)
    test_time = np.zeros(N_RUNS)

    model = [None] * N_RUNS

    for r in range(N_RUNS):
        idx = np.random.permutation(X.shape[0])

        c = RRF(loss='hinge',
                task="classification",
                learning_rate=0.003,
                learning_rate_gamma=0.0001,
Example #41
accuracy = []
train_accu = []
for i in range(n):
    binary = False
    if nclasses[i] == 2:
        binary = True
    feature_zbased = False
    if 'HIGGS' in datasets[i] or 'rna' in datasets[i]:
        feature_zbased = True
    model_name = '{}/models/rxgb/{}/{}_rxgb.model'.format(home, datasets[i], datasets[i])
    file_name = '{}/data/rxgb/{}_test.svm'.format(home, datasets[i])
    bst = xgb.Booster()
    bst.load_model(model_name)
    binary = binary
    model = xgboost_wrapper(bst, binary=binary)	
    test_data, test_labels = load_svmlight_file(file_name, n_features[i])
    test_data = test_data.toarray()
    test_labels = test_labels.astype('int')
    y = model.predict(test_data)
    temp = pd.DataFrame()
    temp['true'] = test_labels
    temp['pred'] = y
    correct = 0
    correct_classified = []
    for j in range(temp.shape[0]):
        if temp.iloc[j]['true'] == temp.iloc[j]['pred']:
            correct = correct + 1
            correct_classified.append(j)           
    selected = random.sample(correct_classified, min(sample_size[i], correct))
    accu = correct / temp.shape[0]
    accuracy.append(accu)
Example #42
from sklearn.datasets import load_svmlight_file
from sklearn.externals import joblib
from sklearn.svm import LinearSVC
import random
import pickle
import numpy as np

X_train, Y_train = load_svmlight_file("../datasets/news20/news20.binary.bz2")

print(X_train.shape)
print(Y_train.shape)

## randomly pick training instances
trainIndices = np.array(random.sample(range(0, 19996), 100))
allIndices = np.array([i for i in range(19996)])

## the rest are testing instances
testIndices = np.setdiff1d(allIndices, trainIndices)

print("testing number", testIndices.shape)

## loads training instances
trainInstances = X_train[trainIndices]
trainLabels = Y_train[trainIndices]

## loads testing instances
testInstances = X_train[testIndices]
testLabels = Y_train[testIndices]

## dumps the files
pickle.dump(testIndices, open("testIndices.pkl", "wb"))
Example #43
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 29 15:29:57 2018

@author: MiaoWangqian
"""


from sklearn import datasets
from sklearn.model_selection import train_test_split
from scipy import sparse
import numpy as np
import time
#%%
filename = "D:/UCD_STA141C/hw1/news20.binary.bz2"
X,y = datasets.load_svmlight_file(filename)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
y_train = sparse.csr_matrix(y_train).T
y_test = sparse.csr_matrix(y_test).T
omega = np.random.randn(1355191).reshape(1355191,1)
omega = sparse.csr_matrix(omega)
lamda = 1
#%%
#gradient
def h(x):
    return 1/(1+np.exp(x))

def Gradient(X,y,omega,lamda):
    a = np.array(sparse.csr_matrix.todense(X@omega))
    b = np.array(sparse.csr_matrix.todense(y))
    g = X.T @ sparse.csr_matrix(-b*h(b*a)) + lamda*omega
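For reference, the expression above is the gradient of the L2-regularized logistic loss, -X^T (y * sigmoid(-y * X @ omega)) + lamda * omega, since h(x) = 1/(1+exp(x)) equals sigmoid(-x). A small dense NumPy check on made-up toy data (not the news20 matrices loaded above):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

# toy stand-ins for X, y and omega
X_toy = np.array([[1.0, 0.0], [0.5, -1.0], [0.0, 2.0]])
y_toy = np.array([[1.0], [-1.0], [1.0]])
w_toy = np.zeros((2, 1))
lam_toy = 1.0

grad = -X_toy.T @ (y_toy * sigmoid(-y_toy * (X_toy @ w_toy))) + lam_toy * w_toy
print(grad)  # shape (2, 1), matching w_toy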
Example #44
def test_not_a_filename():
    # in python 3 integers are valid file opening arguments (taken as unix
    # file descriptors)
    load_svmlight_file(.42)
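In a test suite this negative check is normally wrapped in an exception assertion; a minimal sketch assuming pytest is available (the original example may have used a different assertion helper):

import pytest
from sklearn.datasets import load_svmlight_file

def test_not_a_filename_raises():
    # a float is neither a path, an open file, nor an integer file descriptor
    with pytest.raises(TypeError):
        load_svmlight_file(.42)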
Example #45
def get_data(name):
    data = load_svmlight_file(name)
    return data[0], data[1]
Example #46
File: exp1.py Project: Echo-uu/KDD
import math
import random
import numpy as np
import pandas as pd
from sympy import symbols, diff
from sklearn import datasets as ds
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_svmlight_file
import matplotlib.pyplot as plt

x, y = load_svmlight_file("F:/data/Experiment/exp1/housing_scale")

X = x.todense()  # convert the sparse matrix to a dense matrix

# add a column of ones in front of the data
one = np.mat(np.ones((506,1)))
X = np.c_[one, X]

#split the dataset
x_train, x_test, y_train, y_test = train_test_split(X , y, test_size = 0.25, shuffle= False)

y_train = (np.mat(y_train)).T
y_test = (np.mat(y_test)).T

# solve for the parameter w
def calW(x, y):
    return (x.T * x).I * (x.T * y)

# loss function
def loss(x, y, w):
    return ((y - x * w).T * (y - x * w)) / 2
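calW above is the closed-form least-squares solution w = (X^T X)^{-1} X^T y, and loss is half the squared error. A short usage sketch continuing with the matrices defined above:

# fit on the training split and evaluate the loss on both splits
w = calW(x_train, y_train)
print("train loss:", loss(x_train, y_train, w))
print("test loss:", loss(x_test, y_test, w))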
Example #47
from sklearn import datasets
from scipy.spatial.distance import pdist, squareform
import scipy as scip
import numpy as np
import math

SPLICE_LOCATION = "/home/anirudhan/workspace/foudnations-of-machine-learning/hw2/libsvm-3.20/tools/splice_hw/"
DEGREE = 3

sigma = math.sqrt(1 / (2 * 0.03125))
negSigmaSq = sigma * sigma * -1

X, Y = datasets.load_svmlight_file(SPLICE_LOCATION +
                                   "splice_noise_train.txt.scale")
X = X.toarray()

gamma = 1.0 / X.shape[1]
pairwise_dists = squareform(pdist(X, 'euclidean'))

k_g = scip.exp(pairwise_dists**2 / negSigmaSq)

k_p = np.dot(X, X.T)
k_p = np.multiply(k_p, gamma)
k_p = np.power(k_p, DEGREE)

k_sum = k_g + k_p

Xt, Yt = datasets.load_svmlight_file(SPLICE_LOCATION +
                                     "splice_noise_test.txt.scale")
Xt = Xt.toarray()
Example #48
def get_data(file_input, separator='\t'):
    if 'libsvm' not in file_input:
        file_input = other2libsvm(file_input, separator)
    data = datasets.load_svmlight_file(file_input)
    return data[0], data[1]
Example #49
def test_load_zero_based():
    f = BytesIO(b"-1 4:1.\n1 0:1\n")
    load_svmlight_file(f, zero_based=False)
Example #50
def get_malicious():
    data = load_svmlight_file(DIR_PREFIX + "adult.libsvm")
    return _scale(data[0].todense()), data[1]
def get_data(data):
    my_data = load_svmlight_file(data)
    return my_data[0], my_data[1]
Example #52
def _get_X_y(dataset, multilabel, replace=False):
    """Load a LIBSVM dataset as sparse X and observation y/Y.
    If X and y already exists as npz and npy, they are not redownloaded unless
    replace=True."""

    # some files are compressed, some are not:
    if NAMES[dataset].endswith('.bz2'):
        stripped_name = NAMES[dataset][:-4]
    else:
        stripped_name = NAMES[dataset]

    ext = '.npz' if multilabel else '.npy'
    y_path = DATA_HOME / f"{stripped_name}_target{ext}"
    X_path = DATA_HOME / f"{stripped_name}_data"  # no ext to handle npy or npz
    if (replace or not y_path.exists()
            or not ((X_path.parent / (X_path.name + '.npy')).exists() or
                    (X_path.parent / (X_path.name + '.npz')).exists())):
        # above, do not use .with_suffix bc of datasets like a1a.t, where the
        # method would replace the .t by .npz
        tmp_path = DATA_HOME / stripped_name

        # Download the dataset
        source_path = DATA_HOME / NAMES[dataset]
        if not source_path.parent.exists():
            source_path.parent.mkdir(parents=True)
        download_libsvm(dataset, source_path, replace=replace)

        # decompress file only if it is compressed
        if NAMES[dataset].endswith('.bz2'):
            decompressor = BZ2Decompressor()
            print("Decompressing...")
            with open(tmp_path, "wb") as f, open(source_path, "rb") as g:
                for data in iter(lambda: g.read(100 * 1024), b''):
                    f.write(decompressor.decompress(data))
            source_path.unlink()

        n_features_total = N_FEATURES[dataset]

        print("Loading svmlight file...")
        with open(tmp_path, 'rb') as f:
            X, y = load_svmlight_file(f,
                                      n_features=n_features_total,
                                      multilabel=multilabel)

        tmp_path.unlink()
        # if X's density is more than 0.5, store it in dense format:
        if len(X.data) >= 0.5 * X.shape[0] * X.shape[1]:
            X = X.toarray(order='F')
            np.save(X_path, X)
        else:
            X = sparse.csc_matrix(X)
            X.sort_indices()
            sparse.save_npz(X_path, X)

        if multilabel:
            indices = np.array([lab for labels in y for lab in labels])
            indptr = np.cumsum([0] + [len(labels) for labels in y])
            data = np.ones_like(indices)
            Y = sparse.csr_matrix((data, indices, indptr))
            sparse.save_npz(y_path, Y)
            return X, Y

        else:
            np.save(y_path, y)

    else:
        try:
            X = sparse.load_npz(X_path.parent / (X_path.name + '.npz'))
        except FileNotFoundError:
            X = np.load(X_path.parent / (X_path.name + '.npy'))

        if multilabel:
            y = sparse.load_npz(y_path)
        else:
            y = np.load(y_path)

    return X, y
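A brief usage sketch (the dataset key and its NAMES/N_FEATURES entries, as well as DATA_HOME, are assumed to be configured elsewhere in this module):

# e.g. a single-label dataset key; downloads and decompresses on the first
# call, then reuses the cached .npy/.npz files on subsequent calls.
X, y = _get_X_y("a1a", multilabel=False)
print(X.shape, y.shape)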
Example #53
import numpy as np
from sklearn.datasets import load_svmlight_file
V, y = load_svmlight_file('U1.lsvm', 100)
U, y = load_svmlight_file('M1.lsvm', 100)
U = np.array(U.todense())
V = np.array(V.todense())
np.savetxt('U.csv', U, fmt='%f', delimiter=',', newline='\n')
np.savetxt('V.csv', V, fmt='%f', delimiter=',', newline='\n')
def load_data(path):
    x_train, y_train = datasets.load_svmlight_file(path)
    x_train.todense()
    return x_train, y_train
def classification(test_size):
	'''Perform classification on each of the features, using each of Naive Bayes and Logistic Regression.
	The test_size is given.
	'''
	test_features = [('./lab3_data/1000/train.tokens.svmlight', 'tokens', 0.1),
					('./lab3_data/1000/train.pos.svmlight', 'pos', 0.5),
					('./lab3_data/1000/train.ner.svmlight', 'ner', 0.5),
					('./lab3_data/1000/train.sent_length.svmlight', 'sent_length', 0.5)]
	all_acc, all_f1 = [], []
	# X_train_all, X_test_all, y_train_all, y_test_all = [], [], [], []
	for file, feature, alpha in test_features:
		X, y = load_svmlight_file(file)
		print(X.shape, y.shape)
		X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

		#Stacking for training on all features
		# X_train_all, X_test_all, y_train_all, y_test_all = hstack( X_train_all, X_train ), hstack( X_test_all, X_test), hstack(y_train_all, y_train), hstack(y_test_all, y_test)


		#Classify and report on scores
		for classifier in ['nb', 'maxent']:
			clf = train_clf(X_train, y_train, classifier, alpha)
			y_pred = evaluate(clf, X_test)
			classifier_name = 'Naive Bayes' if classifier == 'nb' else 'Logistic Regression'
			print('='*53)
			print(classifier_name + ' on ' + feature)
			print('-'*53)


			acc = compute_score(y_test, y_pred, score='acc')
			f1 = compute_score(y_test, y_pred, score='f1', average='macro')
			
			all_acc.append(acc)
			all_f1.append(f1)

			print('acc = ', acc)
			target_names = ['author '+str(n) for n in range(20)]
			print(classification_report(y_test, y_pred, target_names=target_names))


	#Prepare a figure to display
	clf_types = ['NB tokens', 'LR tokens',
				'NB POS', 'LR POS',
				'NB NER', 'LR NER',
				'NB Size', 'LR Size']
	fig, ax = plt.subplots()
	index = np.arange(len(all_f1))
	bar_width = 0.35
	opacity = 0.8

	rects1 = plt.bar(index, all_acc, bar_width,
	alpha=opacity,
	color='b',
	label='Acc')
	 
	rects2 = plt.bar(index + bar_width, all_f1, bar_width,
	alpha=opacity,
	color='g',
	label='Macro-F1')
	 
	plt.xlabel('Clf type')
	plt.ylabel('Scores')
	plt.title('Scores by clf type')
	plt.xticks(index + bar_width, [str(m) for m in clf_types])
	plt.legend()
	plt.show()
Example #56
    split = split[:-1]
    for j, feature in enumerate(split):
        if j == 0:
            split[0] = int(split[0])
        else:
            split2 = feature.split(':')
            split[j] = float(split2[1]) 
    #print split
    Y_test_list.append(split[0])
    X_test_list.append(split[1:])
X_test = numpy.array(X_test_list)
Y_test = numpy.array(Y_test_list)
#print sum(numpy.isinf(X_train))

# Use load_svmlight_file
X_train, Y_train = load_svmlight_file("../feats/train_formatted.lsvm")
X_train = X_train.toarray()
X_test, Y_test = load_svmlight_file("../feats/test_formatted.lsvm")
X_test = X_test.toarray()
#print X_train

# LDA
clf = LDA()
clf.fit(X_train,Y_train)
qda_pred = clf.predict(X_test)
accuracy = sum(qda_pred == Y_test)/Y_test.size
print 'LDA Accuracy: ' + str(accuracy)

# QDA
clf = QDA()
clf.fit(X_train,Y_train)
Example #57
def test_load_libsvm():
    datasets = {
        "eurlex-4k": {
            "file": os.path.join(TEST_DATA_PATH, "Eurlex/eurlex_test.txt"),
            "sklearn_args": {
                "multilabel": True,
                "zero_based": True,
                "n_features": 5000,
                "offset": 1
            }
        },
        "amazonCat-13k": {
            "file": os.path.join(TEST_DATA_PATH,
                                 "AmazonCat/amazonCat_test.txt"),
            "sklearn_args": {
                "multilabel": True,
                "zero_based": True,
                "n_features": 203882,
                "offset": 1
            }
        },
        "amazonCat-14k": {
            "file":
            os.path.join(TEST_DATA_PATH,
                         "AmazonCat-14K/amazonCat-14K_test.txt"),
            "sklearn_args": {
                "multilabel": True,
                "zero_based": True,
                "n_features": 597540,
                "offset": 1
            }
        },
        "wiki10-31k": {
            "file": os.path.join(TEST_DATA_PATH, "Wiki10/wiki10_test.txt"),
            "sklearn_args": {
                "multilabel": True,
                "zero_based": True,
                "n_features": 101938,
                "offset": 1
            }
        }
    }

    for d, v in datasets.items():
        download_dataset(d, subset='test', format='bow', root=TEST_DATA_PATH)
        print("\n{} time comparison:".format(d))

        t_start = time()
        sk_X, sk_Y = load_svmlight_file(v["file"], **v["sklearn_args"])
        print(
            "\tsklearn.datasets.load_svmlight_file time: {}s".format(time() -
                                                                     t_start))

        t_start = time()
        nxc_X1, nxc_Y_list = load_libsvm_file(v["file"], labels_format="list")
        print("\tnapkinXC.datasets.load_libsvm_file time: {}s".format(time() -
                                                                      t_start))

        t_start = time()
        nxc_X2, nxc_Y_csrm = load_libsvm_file(v["file"],
                                              labels_format="csr_matrix")
        print("\tnapkinXC.datasets.load_libsvm_file time: {}s".format(time() -
                                                                      t_start))

        assert np.array_equal(nxc_X1.indptr, nxc_X2.indptr)
        assert np.array_equal(nxc_X1.indices, nxc_X2.indices)
        assert np.array_equal(nxc_X1.data, nxc_X2.data)

        assert np.array_equal(nxc_X1.indptr, sk_X.indptr)
        assert np.array_equal(nxc_X1.indices, sk_X.indices)
        assert np.allclose(nxc_X1.data, sk_X.data)
        assert nxc_X1.shape[0] == nxc_Y_csrm.shape[0]

        assert len(nxc_Y_list) == len(sk_Y)
        for nxc_y, sk_y in zip(nxc_Y_list, sk_Y):
            assert len(nxc_y) == len(sk_y)
            assert all(y1 == y2 for y1, y2 in zip(nxc_y, sk_y))
        for i in range(1,len(data.columns)):
            plt.plot(data[data.columns[0]],data[data.columns[i]])
    else:
        x = range(len(data))
        plt.xticks(x,data[data.columns[0]],rotation='vertical')
        for i in range(1,len(data.columns)):
            plt.plot(x,data[data.columns[i]])
        
    plt.legend(data.columns[1:], loc='upper left')
    plt.xlabel(data.columns[0])
    plt.ylabel('Accuracy')
    plt.title('Accuracy plot for ' + fileName)
    plt.show()
#===================================Main =======================================
file ='vision_cuboids_histogram.txt.gz'
X_train, y_train = load_svmlight_file(gzip.open(path+"train\\"+file))
X_test, y_test = load_svmlight_file(gzip.open(path+"test\\"+file))
X_val, y_val = load_svmlight_file(gzip.open(path+"validation\\"+file))
X_train = X_train[y_train!=31]
X_test = X_test[y_test!=31]
X_val = X_val[y_val!=31]
y_train = y_train[y_train!=31]
y_test = y_test[y_test!=31]
y_val = y_val[y_val!=31]    

tech = 'LinearSVC'
C=0.5
X_train_new, X_test_new , X_val_new = featureSelection(X_train,X_test,X_val,y_train, log=True,tech = tech,C=C)

data_df = pd.DataFrame()
n_guass =2
Example #59
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License

import numpy as np
from sklearn.datasets import load_svmlight_file
from sklearn.cross_validation import KFold
from sklearn.linear_model import ElasticNetCV, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from matplotlib import pyplot as plt

data, target = load_svmlight_file('data/E2006.train')

# Change the following to use Lasso instead:
# from sklearn.linear_model import Lasso
# met = Lasso(alpha=0.1)
met = ElasticNet(alpha=0.1)

kf = KFold(len(target), n_folds=5)
pred = np.zeros_like(target)
for train, test in kf:
    met.fit(data[train], target[train])
    pred[test] = met.predict(data[test])

print('[EN 0.1] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred))))
print('[EN 0.1] R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred)))
print('')
Example #60
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_svmlight_file
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
import time

cancer = load_svmlight_file('data/breast-cancer')
cancer_X = cancer[0].toarray()
cancer_y = cancer[1]
dna = load_svmlight_file('data/dna')
dna_X = dna[0].toarray()
dna_y = dna[1]

params_svm = {
    'scale': [True, False],
    'test_size': [0.1, 0.2, 0.3, 0.4, 0.5],
    'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'cancer_dim': [2, 3, 5, 8, 10],
    'dna_dim': [2, 5, 10, 20, 50, 100, 150, 180]
}

params_mlp = {
    'scale': [True, False],
    'test_size': [0.1, 0.2, 0.3, 0.4, 0.5],
    'layers': [(10,), (100,), (10, 10), (100, 100), (200, 200), (100, 200, 100)],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],