def subsample_to_file(svm_file, out_dir, out_name, multilabel=False, row_ratio=0.5, col_ratio=0.3, random_state=12): """ Example: '''python # running the following command in the current directory will create a # `tmp` folder, if it does not already exist, and generate a file called # `a9a_sub` from the original file `./data/a9a`. Both files are # in libsvm format. subsample_to_file("./data/a9a", "./tmp", "a9a_sub") # read the subsampled file and make sure its number of rows is half of # that of a9a and its number of columns is roughly a third of a9a's (123) X, y = load_svmlight_file('./tmp/a9a_sub') assert X.shape == (16280, 36) ''' """ assert 1 >= row_ratio > 0, \ "Row ratio {row_ratio} must be in (0, 1]" \ .format(**locals()) assert 1 >= col_ratio > 0, \ "Col ratio {col_ratio} must be in (0, 1]" \ .format(**locals()) X, y = load_svmlight_file(svm_file, multilabel=multilabel) n, m = X.shape subn = int(n*row_ratio) subm = int(m*col_ratio) rst = np.random.RandomState(random_state) ridx = rst.choice(n, subn, replace=False) cidx = rst.choice(m, subm, replace=False) mkdir_p(out_dir) out_file = os.path.join(out_dir, out_name) dump_svmlight_file(X[ridx,:][:,cidx], y[ridx], out_file, multilabel=multilabel)
def save_libfm(X_sprs_mat, y_array, f): print("Save LibFM Format") dump_svmlight_file(X_sprs_mat, y_array, f) return
def test_load_with_offsets(sparsity, n_samples, n_features): rng = np.random.RandomState(0) X = rng.uniform(low=0.0, high=1.0, size=(n_samples, n_features)) if sparsity: X[X < sparsity] = 0.0 X = sp.csr_matrix(X) y = rng.randint(low=0, high=2, size=n_samples) f = BytesIO() dump_svmlight_file(X, y, f) f.seek(0) size = len(f.getvalue()) # put some marks that are likely to happen anywhere in a row mark_0 = 0 mark_1 = size // 3 length_0 = mark_1 - mark_0 mark_2 = 4 * size // 5 length_1 = mark_2 - mark_1 # load the original sparse matrix into 3 independent CSR matrices X_0, y_0 = load_svmlight_file(f, n_features=n_features, offset=mark_0, length=length_0) X_1, y_1 = load_svmlight_file(f, n_features=n_features, offset=mark_1, length=length_1) X_2, y_2 = load_svmlight_file(f, n_features=n_features, offset=mark_2) y_concat = np.concatenate([y_0, y_1, y_2]) X_concat = sp.vstack([X_0, X_1, X_2]) assert_array_almost_equal(y, y_concat) assert_array_almost_equal(X.toarray(), X_concat.toarray())
def test_dump(): Xs, y = load_svmlight_file(datafile) Xd = Xs.toarray() for X in (Xs, Xd): for zero_based in (True, False): for dtype in [np.float32, np.float64]: f = BytesIO() dump_svmlight_file(X.astype(dtype), y, f, zero_based=zero_based) f.seek(0) comment = f.readline() assert_in("scikit-learn %s" % sklearn.__version__, comment) comment = f.readline() assert_in(["one", "zero"][zero_based] + "-based", comment) X2, y2 = load_svmlight_file(f, dtype=dtype, zero_based=zero_based) assert_equal(X2.dtype, dtype) if dtype == np.float32: assert_array_almost_equal( # allow a rounding error at the last decimal place Xd.astype(dtype), X2.toarray(), 4, ) else: assert_array_almost_equal( # allow a rounding error at the last decimal place Xd.astype(dtype), X2.toarray(), 15, ) assert_array_equal(y, y2)
def batch_fit(self, Xs, ys, dump=True): qids = [np.array([i] * len(ys[i])) for i in range(len(ys))] print "dumping data to Xtrain.data" if dump: dump_svmlight_file( np.concatenate(Xs), np.concatenate(ys), "Xtrain.data", zero_based=False, query_id=np.concatenate(qids) ) print "now learning" print call( [ self.path + "svm_hmm_learn", "-c", "%d" % self.C, "--t", "%d" % self.t, "--e", "%d" % self.e, "Xtrain.data", "svmhmm-model.dat", ] ) return self
def fit(self, X, Y): self.labels=list(set(Y)) if len(self.labels) > 2 : self.multiclass=True #print 'multiclass' else: self.multiclass=False self.train_fname =self.base_str +'-svmcmd-train' + '.dat' self.model_fname =self.train_fname + '.model' dump_svmlight_file(X,Y,self.train_fname ,zero_based=False) if self.multiclass: command_line=path_to_train_program+'gtsvm_initialize {0} -f {1} -o {2} -m 1 '.format(self.param_str, self.train_fname , self.model_fname ) else: command_line=path_to_train_program+'gtsvm_initialize -f {1} -o {2} {0}'.format(self.param_str, self.train_fname , self.model_fname ) args = shlex.split(command_line) p = subprocess.Popen(args) p.wait() command_line=path_to_train_program+'gtsvm_optimize -i {0} -o {1} -e {2} -n {3}'.format(self.model_fname,self.model_fname,self.tol,self.max_iter) args = shlex.split(command_line) p = subprocess.Popen(args,stderr=subprocess.PIPE) p.wait() opt_err_str=p.stderr.read() ##gtsvm is too buggy if len(opt_err_str) < 1: command_line=path_to_train_program+'gtsvm_shrink -i {0} -o {1}'.format(self.model_fname,self.model_fname) args = shlex.split(command_line) p = subprocess.Popen(args) p.wait() self.train_fail=False else : self.train_fail=True return self
def predict(self, X): if isinstance(X,list): self.test_n_sample=len(X) else: self.test_n_sample=X.shape[0] Y=[1]*self.test_n_sample self.test_fname =self.base_str +'-svmcmd-test' + '.dat' self.predict_fname =self.base_str +'-svmcmd-predict' + '.dat' dump_svmlight_file(X,Y,self.test_fname ,zero_based=False) command_line=path_to_train_program+'gtsvm_classify -f {0} -i {1} -o {2}'.format(self.test_fname , self.model_fname, self.predict_fname ) args = shlex.split(command_line) p = subprocess.Popen(args) p.wait() if self.train_fail: return [max(self.labels)+1]*self.test_n_sample if self.multiclass : f = open(self.predict_fname, 'rb') self.predicted_weight = map(lambda row: map(float,row), list(csv.reader(f))) f.close() Y_predict=map(np.argmax, self.predicted_weight) else : self.predicted_weight = np.loadtxt( self.predict_fname) Y_predict=map(int,map(round,self.predicted_weight)) return Y_predict
def generate_weekday_newbuyer_exposure(df): """ Add new-buyer count and exposure count features """ X = df[['uv_0612_0618', 'uv_weekday', 'uv_weekend', 'no_subsidy_exposure', 'newbuyer_6_18']] y = df.uv_0626_0702 dump_svmlight_file(X, y, './uv_weekday_weekend_newbuyer_exposure_without_outliers.dat')
def generate_week(df): """ Generate the 1-dimensional feature """ X = df[['uv_0612_0618']] y = df.uv_0626_0702 dump_svmlight_file(X, y, './uv_week.dat')
def executa_extracao_n(base_treino, metodo, n=1): inicio = time() lista_imagens = arq.busca_arquivos(base_treino, "*.png") n_imgs_treino = len(lista_imagens) for lado in range(8,n+1,4): atributos = [] rotulos = [] arq_treino = base_treino + "base_PFTAS_"+str(lado)+"x"+str(lado)+".svm" ## START OF THE ATTRIBUTE EXTRACTION PROCESS for arq_imagem in lista_imagens: print("Arquivo: " + arq_imagem) imagem = mh.imread(arq_imagem) if (imagem is not None): classe, _ = ex.classe_arquivo(arq_imagem) print("executa_extracao_n - shape imagem:" + str(imagem.shape)) # Extract the attributes and generate the patch files for the training set atrs,rots = extrai_pftas_patches_n(imagem, classe, lado) atributos += atrs rotulos += rots dump_svmlight_file(atributos, rotulos, arq_treino) log("Extraidos atributos da base " + base_treino + " utilizando " + metodo + "\n para " + str(n_imgs_treino) + "imagens") # Print the execution time log(str(time()-inicio) + "EXTRAÇÃO")
def generate_weekday_weekend(df): """ Generate the 3-dimensional features """ X = df[['uv_0612_0618', 'uv_weekday', 'uv_weekend']] y = df.uv_0626_0702 dump_svmlight_file(X, y, './uv_weekday_weekend.dat')
def test_dump(): Xs, y = load_svmlight_file(datafile) Xd = Xs.toarray() for X in (Xs, Xd): for zero_based in (True, False): for dtype in [np.float32, np.float64]: f = BytesIO() # we need to pass a comment to get the version info in; # LibSVM doesn't grok comments so they're not put in by # default anymore. dump_svmlight_file(X.astype(dtype), y, f, comment="test", zero_based=zero_based) f.seek(0) comment = f.readline() assert_in("scikit-learn %s" % sklearn.__version__, comment) comment = f.readline() assert_in(["one", "zero"][zero_based] + "-based", comment) X2, y2 = load_svmlight_file(f, dtype=dtype, zero_based=zero_based) assert_equal(X2.dtype, dtype) if dtype == np.float32: assert_array_almost_equal( # allow a rounding error at the last decimal place Xd.astype(dtype), X2.toarray(), 4) else: assert_array_almost_equal( # allow a rounding error at the last decimal place Xd.astype(dtype), X2.toarray(), 15) assert_array_equal(y, y2)
def save_all_data_in_svmlight_format(self, file_path, extraction_method, label_type): label_list, feature_vector_list = self.extract_all_data(extraction_method, label_type) with open(file_path, 'wb') as f: datasets.dump_svmlight_file(feature_vector_list, label_list, f)
def dump_svmlight(X_matrix, Y, feature_names, output_filename, feature_id_offset = 0): dump_svmlight_file(X_matrix, Y, output_filename) contents = None with open(output_filename) as output_file: contents = '#' + ' '.join(feature_names) + '\n' + ''.join(output_file.readlines()) with open(output_filename, 'w') as output_file: output_file.write(contents)
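# A hedged round-trip sketch (not from the original; the toy arrays and the
# "toy.svm" filename are placeholders): it relies on load_svmlight_file
# skipping lines that start with '#', so the feature-name header written by
# dump_svmlight above does not break re-reading.
import numpy as np
from sklearn.datasets import load_svmlight_file

dump_svmlight(np.array([[1.0, 0.0], [0.0, 2.0]]), np.array([0, 1]),
              ["feat_a", "feat_b"], "toy.svm")
X2, y2 = load_svmlight_file("toy.svm")
assert X2.shape == (2, 2)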
def data_dump(self, f, X_train, X_test, y_train, y_test): from sklearn.datasets import dump_svmlight_file ddd = dict() new_y_train = [] last = 0 for yy in y_train: if yy in ddd: yy = (ddd[yy]) else: ddd[yy] = last yy = last last += 1 new_y_train.append(yy) dump_svmlight_file(X_train, new_y_train, f + ".svmlight.train") new_y_test = [] for yy in y_test: if yy in ddd: yy = (ddd[yy]) else: ddd[yy] = last yy = last last += 1 new_y_test.append(yy) dump_svmlight_file(X_test, new_y_test, f + ".svmlight.test")
def load_training_data(file_location=str, load_from_database=False, limit=int(1000), clean_dataset=True): """ If ```load_from_database``` is True, retrieves and stores data from database to file. Arguments: file_location (str): Path + filename of libsvm file to save/load (e.g. 'training_data') load_from_database (bool): Should data be retrieved from database? limit (int): Amount of records to retrieve from database (default=1000) clean_dataset (bool): Should questions be cleaned (e.g. remove code samples, hexadecimals, numbers, etc)? Returns: (pandas.DataFrame.from_csv, sklearn.datasets.load_svmlight_file): Tuple containing a pandas.DataFrame (all data retrieved from database) and tuple with training data (load_svmlight_file) See: | ```MySQLDatabase().retrieve_training_data``` | ```pandas.DataFrame.to_csv``` | ```pandas.DataFrame.from_csv``` | ```sklearn.datasets.dump_svmlight_file``` | ```sklearn.datasets.load_svmlight_file``` """ svm_file = file_location + ".dat" csv_file = file_location + ".csv" if load_from_database: comment = u"label: (-1: Bad question, +1: Good question); features: (term_id, frequency)" MySQLDatabase().set_vote_value_params() data = MySQLDatabase().retrieve_training_data(limit, clean_dataset) # create a term-document matrix vectorizer = CountVectorizer(analyzer='word', min_df=0.01, stop_words="english") td_matrix = vectorizer.fit_transform(data.get(QUESTION_TEXT_KEY)) data.to_csv(csv_file) dump_svmlight_file(td_matrix, data[CLASS_LABEL_KEY], f=svm_file, comment=comment) return DataFrame.from_csv(csv_file), load_svmlight_file(svm_file)
def test_dump_comment(): X, y = load_svmlight_file(datafile) X = X.toarray() f = BytesIO() ascii_comment = "This is a comment\nspanning multiple lines." dump_svmlight_file(X, y, f, comment=ascii_comment, zero_based=False) f.seek(0) X2, y2 = load_svmlight_file(f, zero_based=False) assert_array_almost_equal(X, X2.toarray()) assert_array_equal(y, y2) # XXX we have to update this to support Python 3.x utf8_comment = "It is true that\n\xc2\xbd\xc2\xb2 = \xc2\xbc" f = BytesIO() assert_raises(UnicodeDecodeError, dump_svmlight_file, X, y, f, comment=utf8_comment) unicode_comment = utf8_comment.decode("utf-8") f = BytesIO() dump_svmlight_file(X, y, f, comment=unicode_comment, zero_based=False) f.seek(0) X2, y2 = load_svmlight_file(f, zero_based=False) assert_array_almost_equal(X, X2.toarray()) assert_array_equal(y, y2) f = BytesIO() assert_raises(ValueError, dump_svmlight_file, X, y, f, comment="I've got a \0.")
def test_load_with_long_qid(): # load svmfile with longint qid attribute data = b(""" 1 qid:0 0:1 1:2 2:3 0 qid:72048431380967004 0:1440446648 1:72048431380967004 2:236784985 0 qid:-9223372036854775807 0:1440446648 1:72048431380967004 2:236784985 3 qid:9223372036854775807 0:1440446648 1:72048431380967004 2:236784985""") X, y, qid = load_svmlight_file(BytesIO(data), query_id=True) true_X = [[1, 2, 3], [1440446648, 72048431380967004, 236784985], [1440446648, 72048431380967004, 236784985], [1440446648, 72048431380967004, 236784985]] true_y = [1, 0, 0, 3] trueQID = [0, 72048431380967004, -9223372036854775807, 9223372036854775807] assert_array_equal(y, true_y) assert_array_equal(X.toarray(), true_X) assert_array_equal(qid, trueQID) f = BytesIO() dump_svmlight_file(X, y, f, query_id=qid, zero_based=True) f.seek(0) X, y, qid = load_svmlight_file(f, query_id=True, zero_based=True) assert_array_equal(y, true_y) assert_array_equal(X.toarray(), true_X) assert_array_equal(qid, trueQID) f.seek(0) X, y = load_svmlight_file(f, query_id=False, zero_based=True) assert_array_equal(y, true_y) assert_array_equal(X.toarray(), true_X)
def test_dump_concise(): one = 1 two = 2.1 three = 3.01 exact = 1.000000000000001 # loses the last decimal place almost = 1.0000000000000001 X = [[one, two, three, exact, almost], [1e9, 2e18, 3e27, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]] y = [one, two, three, exact, almost] f = BytesIO() dump_svmlight_file(X, y, f) f.seek(0) # make sure it's using the most concise format possible assert_equal(f.readline(), b("1 0:1 1:2.1 2:3.01 3:1.000000000000001 4:1\n")) assert_equal(f.readline(), b("2.1 0:1000000000 1:2e+18 2:3e+27\n")) assert_equal(f.readline(), b("3.01 \n")) assert_equal(f.readline(), b("1.000000000000001 \n")) assert_equal(f.readline(), b("1 \n")) f.seek(0) # make sure it's correct too :) X2, y2 = load_svmlight_file(f) assert_array_almost_equal(X, X2.toarray()) assert_array_equal(y, y2)
def create_train_test(n_samples, doc2vec, save_svmlight=True): print "Creating train & test sets..." # Create labelled data arrays. data = np.zeros((n_samples, doc2vec.size)) labels = np.zeros(n_samples) for i in range(n_samples / 2): prefix_train_pos = 'TRAIN_POS_' + str(i) prefix_train_neg = 'TRAIN_NEG_' + str(i) data[i] = doc2vec.model.docvecs[prefix_train_pos] data[n_samples / 2 + i] = doc2vec.model.docvecs[prefix_train_neg] labels[i] = 1 # Split in train and validation arrays. train, test, train_labels, test_labels = train_test_split( data, labels, test_size=0.3, random_state=42) if save_svmlight: current_path = os.path.abspath( os.path.join(os.getcwd(), os.pardir)) dump_svmlight_file(train, train_labels, current_path + "/Data/Processed/TrainSet.svm") dump_svmlight_file(test, test_labels, current_path + "/Data/Processed/TestSet.svm") return train, test, train_labels, test_labels
def pair_vectors(pairs, features, words, output_path): vectorizer = DictVectorizer() vectors = vectorizer.fit_transform(x[1] for x in features) vector_map = {word:vector for word, vector in itertools.izip((x[0].split('/')[0] for x in features), vectors)} # Positive examples positive = [] record = [] for specific, general in pairs: positive.append(vector_map[general] - vector_map[specific]) record.append( (specific, general, 1) ) pair_set = set([tuple(x) for x in pairs]) non_positive = [] for i in range(len(positive)): first = second = None while first == second or (first, second) in pair_set: first = words[random.randint(len(words))] second = words[random.randint(len(words))] non_positive.append(vector_map[second] - vector_map[first]) record.append( (first, second, 0) ) data = vstack(positive + non_positive) target = [1]*len(positive) + [0]*len(non_positive) # Save dataset with open(os.path.join(output_path,'wn-noun-dependencies.mat'), 'wb') as data_file: dump_svmlight_file(data, target, data_file) with open(os.path.join(output_path,'wn-noun-dependencies.json'), 'w') as record_file: json.dump(record, record_file)
def predict(self,X) : # write test file to specific format dump_svmlight_file(X,np.zeros(X.shape[0]),self._test_file_name,zero_based=True) # call _execute_prediction self._execute_prediction(self._test_file_name) # import the output of the script and return it predictions = pd.read_csv(self._temp_pred_file_name,header = None) return predictions
def download_mnist(): training, test = fetch_mnist(data_home=dataset_dir) X, y = training datasets.dump_svmlight_file(X, y, join(dataset_dir, "mnist")) X, y = test datasets.dump_svmlight_file(X, y, join(dataset_dir, "mnist.t"))
def save_svmlight(x, y, path): LOG.debug("saving svmlight to %s", path) ensure_exist(path) _, n = x.shape with open(os.path.join(path, "nfeature.txt"), "w") as f: f.write(str(n)) dump_svmlight_file(x, y, os.path.join(path, "data"), zero_based=False)
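# A hedged load counterpart (hypothetical helper, assuming the on-disk layout
# produced by save_svmlight above: "nfeature.txt" holding the column count
# next to the "data" file). Passing n_features back prevents trailing
# all-zero columns from being silently dropped on reload.
import os
from sklearn.datasets import load_svmlight_file

def load_svmlight(path):
    with open(os.path.join(path, "nfeature.txt")) as f:
        n = int(f.read())
    return load_svmlight_file(os.path.join(path, "data"),
                              n_features=n, zero_based=False)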
def train_predict(train_file, test_file, predict_valid_file, predict_test_file, n_iter=100, dim=4, lrate=.1, n_fold=5): feature_name = os.path.basename(train_file)[:-8] logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', level=logging.DEBUG, filename='libfm_{}_{}_{}_{}.log'.format( n_iter, dim, lrate, feature_name )) logging.info('Loading training data') X, y = load_svmlight_file(train_file) cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015) logging.info('Cross validation...') p = np.zeros_like(y) lloss = 0. for i_trn, i_val in cv: now = datetime.now().strftime('%Y%m%d-%H%M%S') valid_train_file = '/tmp/libfm_train_{}_{}.sps'.format(feature_name, now) valid_test_file = '/tmp/libfm_valid_{}_{}.sps'.format(feature_name, now) valid_predict_file = '/tmp/libfm_predict_{}_{}.sps'.format(feature_name, now) dump_svmlight_file(X[i_trn], y[i_trn], valid_train_file, zero_based=False) dump_svmlight_file(X[i_val], y[i_val], valid_test_file, zero_based=False) subprocess.call(["libFM", "-task", "c", '-dim', '1,1,{}'.format(dim), '-init_stdev', str(lrate), '-iter', str(n_iter), '-train', valid_train_file, '-test', valid_test_file, '-out', valid_predict_file]) p[i_val] = np.loadtxt(valid_predict_file) lloss += log_loss(y[i_val], p[i_val]) os.remove(valid_train_file) os.remove(valid_test_file) os.remove(valid_predict_file) logging.info('Log Loss = {:.4f}'.format(lloss / n_fold)) np.savetxt(predict_valid_file, p, fmt='%.6f') logging.info('Retraining with 100% data...') subprocess.call(["libFM", "-task", "c", '-dim', '1,1,{}'.format(dim), '-init_stdev', str(lrate), '-iter', str(n_iter), '-train', train_file, '-test', test_file, '-out', predict_test_file])
def get_train_and_test_spaese_matrix(): Y = pd.read_csv('data/train_Y.csv', index_col='user_id')['type'] train_X = combine_all_behavior() # dump_svmlight_file(train_X,Y,'data/train_metrix') dump_svmlight_file(train_X,Y,'data/train_metrix_3') test_X = combine_all_behavior(is_train=False) test_Y = [0]*(test_X.shape[0]) # dump_svmlight_file(test_X,test_Y,'data/test_metrix') dump_svmlight_file(test_X,test_Y,'data/test_metrix_3')
def test_dump(): X_sparse, y_dense = load_svmlight_file(datafile) X_dense = X_sparse.toarray() y_sparse = sp.csr_matrix(y_dense) # slicing a csr_matrix can unsort its .indices, so test that we sort # those correctly X_sliced = X_sparse[np.arange(X_sparse.shape[0])] y_sliced = y_sparse[np.arange(y_sparse.shape[0])] for X in (X_sparse, X_dense, X_sliced): for y in (y_sparse, y_dense, y_sliced): for zero_based in (True, False): for dtype in [np.float32, np.float64, np.int32]: f = BytesIO() # we need to pass a comment to get the version info in; # LibSVM doesn't grok comments so they're not put in by # default anymore. if (sp.issparse(y) and y.shape[0] == 1): # make sure y's shape is: (n_samples, n_labels) # when it is sparse y = y.T dump_svmlight_file(X.astype(dtype), y, f, comment="test", zero_based=zero_based) f.seek(0) comment = f.readline() comment = str(comment, "utf-8") assert_in("scikit-learn %s" % sklearn.__version__, comment) comment = f.readline() comment = str(comment, "utf-8") assert_in(["one", "zero"][zero_based] + "-based", comment) X2, y2 = load_svmlight_file(f, dtype=dtype, zero_based=zero_based) assert_equal(X2.dtype, dtype) assert_array_equal(X2.sorted_indices().indices, X2.indices) X2_dense = X2.toarray() if dtype == np.float32: # allow a rounding error at the last decimal place assert_array_almost_equal( X_dense.astype(dtype), X2_dense, 4) assert_array_almost_equal( y_dense.astype(dtype), y2, 4) else: # allow a rounding error at the last decimal place assert_array_almost_equal( X_dense.astype(dtype), X2_dense, 15) assert_array_almost_equal( y_dense.astype(dtype), y2, 15)
def project(file_name, dimensions): data = load_svmlight_file(file_name) projector = SparseRandomProjection(dimensions, 1/3.0, dense_output=True) projected = projector.fit_transform(data[0]) new_file_name = file_name[:-4] + '-' + str(dimensions) + '.mat' new_file = open(new_file_name, 'wb') dump_svmlight_file(projected, data[1], new_file)
def save_output(self, X, epoch=None): # write output to file if epoch is not None: fname_out = self.conf.fname_out.replace('%e', str(epoch).zfill(5)) else: fname_out = self.conf.fname_out.replace('%e', 'final') if self.conf.verbosity > 1: print "Saving output to", fname_out, "..." dump_svmlight_file(X, self.tl, fname_out)
def glimpse_to_svmlight(input_file, train_file, test_file): with open(input_file) as fh: exp = pickle.load(fh) ftrs = ExtractFeatures(Layer.C2, exp.extractor.activation) trng = GetTrainingSet(exp) dump_svmlight_file(ftrs[trng], exp.corpus.labels[trng] + 1, train_file, zero_based=False) dump_svmlight_file(ftrs[~trng], exp.corpus.labels[~trng] + 1, test_file, zero_based=False) print "Categories" print "----------" print "\n".join("%d - %s" % (index+1,name) for (index,name) in enumerate(exp.corpus.class_names))
folds = preprocess.create_folds(X, y, queries, 5) fold_number = 1 C_array = [0.1, 0.01, 0.001] # model_handler = mh.models_handler(C_array) validated = set() scores = {} models = {} for train, test in folds: evaluator.empty_validation_files() validated, validation_set, train_set = preprocess.create_validation_set( 5, validated, set(train), number_of_queries, queries) train_file = "train" + str(fold_number) + ".txt" run_command("rm " + train_file) dump_svmlight_file(X[train], y[train], train_file, query_id=queries[train], zero_based=False) for C in C_array: model_file = learn_svm(C, train_file, fold_number) weights = recover_model(model_file) svm = s.svm_sgd(C) svm.w = weights score_file = svm.predict(X, queries, validation_set, evaluator, True) score = evaluator.run_trec_eval(score_file, qrels_file) scores[svm.C] = score models[svm.C] = svm max_C = max(scores.items(), key=operator.itemgetter(1))[0] chosen_model = models[max_C] chosen_model.predict(X, queries, test, evaluator)
def random(data, d_sub, name, what, array, R, n_label): np.set_printoptions(threshold=np.inf, suppress=True) data = scipy.sparse.coo_matrix.tocsr(data) label = data[:, 0] data = sklearn.preprocessing.maxabs_scale(data[:, 1:]) n_sample, d_feature = np.shape(data) a = math.sqrt(float(d_feature / d_sub)) * data # # pca = PCA(n_components=d_sub).fit_transform(a) # # label = (np.asarray(label)).reshape(-1) # dump_svmlight_file(pca, label, 'other_method/kmeans_linear/%s/%s_pca' % (name, what), zero_based=False) train2 = a[:, R] dump_svmlight_file(train2, label, 'other_method/kmeans_linear/%s/%s_random' % (name, what), zero_based=False) # func = KMeans(n_clusters=d_sub) # transform = a.T # kmeans = func.fit(transform) # klabel = kmeans.labels_ # R_means = [] # for i in range(d_sub): # temp1 = [] # temp2 = [] # for idx, val in enumerate(klabel): # if val == i: # temp1.append(idx) # distance = func.transform(transform[idx]) # temp2.append(distance[0, val]) # # idx_temp = temp2.index(min(temp2)) # R_means.append(temp1[idx_temp]) # R_means = sorted(R_means) # train2 = a[:,R_means] # label = (np.asarray(label)).reshape(-1) # dump_svmlight_file(train2, label, 'other_method/kmeans_linear/%s/%s_means' % (name, what), # zero_based=False) # b = a # transform = b.T # var = np.var(transform, axis=1) # topn = np.argsort(var, axis=0) # topn = np.reshape(topn, (1, -1)) # topn = np.asarray(topn) # R_variace = topn[0, d_feature - d_sub:] # R_variace = sorted(R_variace) # R_variace = np.asarray(R_variace) # train2 = a[:,R_variace] # label = (np.asarray(label)).reshape(-1) # dump_svmlight_file(train2, label, 'other_method/kmeans_linear/%s/%s_variance' % (name, what), # zero_based=False) b = a[:n_label] transform = b.T norm = linalg.norm(transform, axis=1) # print(norm) topn = np.argsort(norm, axis=0) # print(topn) topn = np.reshape(topn, (1, -1)) topn = np.asarray(topn) R_norm = topn[0, d_feature - d_sub:] R_norm = sorted(R_norm) R_norm = np.asarray(R_norm) train2 = a[:, R_norm] # exit() dump_svmlight_file(train2, label, 'other_method/kmeans_linear/%s/%s_norm' % (name, what), zero_based=False)
def save_as_svmlight(M, yid, fname): dump_svmlight_file(M[:, [x for x in range(M.shape[1]) if x != yid]], M[:, yid], fname)
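# A hypothetical usage sketch (the toy matrix and filename are not from the
# original): column 1 is taken as the label, the remaining columns become the
# features.
import numpy as np

M = np.array([[1.0, 0.0, 2.0],
              [0.5, 1.0, 3.0]])
save_as_svmlight(M, yid=1, fname="toy_labels.svm")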
dev_text_token_list = [] for text in dev_review_list: dev_text_token_list.append(Counter(set(text))) test_text_token_list = [] for text in test_review_list: test_text_token_list.append(Counter(set(text))) print('Save Data') #Save csr_matrix data into libsvm train_review_matrix = review_to_csr_matrix(train_text_token_list, feature_list, feature_num) train_stars_matrix = stars_to_csr_matrix(train_stars_list, class_num) dump_svmlight_file(train_review_matrix, train_stars_matrix, str(method + '_train.libsvm'), multilabel=True) print('Data saved: {}'.format(str(method + '_train.libsvm'))) eval_review_matrix = review_to_csr_matrix(eval_text_token_list, feature_list, feature_num) eval_stars_matrix = stars_to_csr_matrix(eval_stars_list, class_num) dump_svmlight_file(eval_review_matrix, eval_stars_matrix, str(method + '_eval.libsvm'), multilabel=True) print('Data saved: {}'.format(str(method + '_eval.libsvm'))) dev_review_matrix = review_to_csr_matrix(dev_text_token_list, feature_list, feature_num) dev_stars_matrix = stars_to_csr_matrix(dev_stars_list, class_num)
# -*- coding: utf-8 -*- """ Created on Sun Sep 9 00:18:22 2018 @author: Nitin """ """ from sklearn.datasets import load_svmlight_file, dump_svmlight_file file = "../data/usps/usps-train.dat" X,y = load_svmlight_file(file) dump_svmlight_file(X, y, file, zero_based=True, comment=None, query_id=None, multilabel=False) file = "../data/usps/usps-test.dat" X,y = load_svmlight_file(file) dump_svmlight_file(X, y, file, zero_based=True, comment=None, query_id=None, multilabel=False) for i in range(10): file = "../data/usps/t_{}.dat".format(i) X,y = load_svmlight_file(file) dump_svmlight_file(X, y, file, zero_based=True, comment=None, query_id=None, multilabel=False) """ # -*- coding: utf-8 -*- """ Created on Fri Feb 23 23:20:58 2018 File to load and process the USPS data into a binary classification format. @author: Nitin """ import numpy as np
#!/usr/bin/python # -*- coding: utf-8 -*- from sklearn.datasets import dump_svmlight_file import pandas as pd import numpy as np df = pd.read_csv("transfered.csv") y_data = np.array(df['is_malware']) del df['is_malware'] del df['sha1'] X_data = np.array(df) dump_svmlight_file(X_data, y_data, 'apk_libsvm.dat', zero_based=False, multilabel=False)
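# A hedged sanity check (assuming the dump above has run and 'apk_libsvm.dat'
# exists): reload the file with the same one-based indexing and confirm the
# row count matches the original arrays.
from sklearn.datasets import load_svmlight_file

X_back, y_back = load_svmlight_file('apk_libsvm.dat', zero_based=False)
assert X_back.shape[0] == y_data.shape[0]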
model_file = os.path.join(data_path, "sofml.model") training_file = os.path.join(data_path, "train_data.dat") training2_file = os.path.join(data_path, "train2_data.dat") test_file = os.path.join(data_path, "test_data.dat") pred_train2_file = os.path.join(data_path, "pred_train2.csv") pred_test_file = os.path.join(data_path, "pred_test.csv") # write out traindata and testdata to svmlight format print "writing out files" ntrain1_label = train1_label.copy() ntrain1_label.values[np.where(ntrain1_label == 0)] = -1 ntrain2_label = train2_label.copy() ntrain2_label.values[np.where(ntrain2_label == 0)] = -1 dump_svmlight_file(train1_data_norm, ntrain1_label, training_file, zero_based=False) dump_svmlight_file(train2_data_norm, ntrain2_label, training2_file, zero_based=False) dump_svmlight_file(test_data_norm, np.zeros((test_data_norm.shape[0], )), test_file, zero_based=False) # train #print "training sofia" call( sofiaml_path + " --learner_type sgd-svm --loop_type roc --prediction_type logistic --iterations 200000 --lambda 10000 --training_file "
def generate_feature(train_file, test_file, object_file, train_feature_file, test_feature_file): logging.info('loading input data') trn = pd.read_csv(train_file) tst = pd.read_csv(test_file) obj = pd.read_csv(object_file, header=None) obj.columns = ['course_id', 'object', 'category', 'children', 'start'] n_trn = trn.shape[0] trn.time = pd.to_datetime(trn.time) tst.time = pd.to_datetime(tst.time) df = pd.concat([trn, tst], axis=0) # get last dates of courses last_date = df[['course_id', 'time']].groupby('course_id', as_index=False).max() last_date.columns = ['course_id', 'last_date'] # extract object information obj.children.fillna('', inplace=True) obj['n_children'] = obj.children.apply( lambda x: int(np.log2(1 + len(x.split())))) obj.start.replace('null', '1999-01-01 00:00:00', inplace=True) obj.start = pd.to_datetime(obj.start) obj = pd.merge(obj, last_date, on='course_id', how='left') obj['obj_days_before_last_date'] = ( obj.last_date - obj.start).apply(lambda x: pd.Timedelta(x).days) obj.ix[obj.obj_days_before_last_date > 30, 'obj_days_before_last_date'] = 30 # merge log data with last coursedate and object information df = pd.merge(df, last_date, on='course_id', how='left') df = pd.merge( df, obj[['object', 'category', 'n_children', 'obj_days_before_last_date']], on='object', how='left') df['days_before_last_date'] = ( df.last_date - df.time).apply(lambda x: pd.Timedelta(x).days) df['weeks_before_last_date'] = df.days_before_last_date // 7 df.ix[df.weeks_before_last_date == 4, 'weeks_before_last_date'] = 3 df['last_month'] = df.last_date.apply(lambda x: x.month) df['obj_10_days_after_last_date'] = df.obj_days_before_last_date.apply( lambda x: 1 if x < 0 and x >= -10 else 0) df.obj_days_before_last_date = df.obj_days_before_last_date.apply( lambda x: np.sign(x) * int(np.log2(1 + np.sign(x) * x)) \ if ~pd.isnull(x) else x ) df.drop(['time', 'last_date'], axis=1, inplace=True) df.set_index('enrollment_id', inplace=True) X = encode_categorical_features(df, n=n_trn, min_obs=100, nan_as_var=True) X = X.tocsr() dump_svmlight_file(X[:n_trn], trn.enrollment_id.values, train_feature_file, zero_based=False) dump_svmlight_file(X[n_trn:], tst.enrollment_id.values, test_feature_file, zero_based=False)
def dump_data(x, y, file_output): datasets.dump_svmlight_file(x, y, file_output) os.remove("%s_tmp" % file_output)
def test(self): X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2) train_data = lgb.Dataset(X_train, label=y_train) valid_data = train_data.create_valid(X_test, label=y_test) params = { "objective": "binary", "metric": "auc", "min_data": 10, "num_leaves": 15, "verbose": -1, "num_threads": 1, "max_bin": 255, "gpu_use_dp": True } bst = lgb.Booster(params, train_data) bst.add_valid(valid_data, "valid_1") for i in range(20): bst.update() if i % 10 == 0: print(bst.eval_train(), bst.eval_valid()) self.assertEqual(bst.current_iteration(), 20) self.assertEqual(bst.num_trees(), 20) self.assertEqual(bst.num_model_per_iteration(), 1) self.assertAlmostEqual(bst.lower_bound(), -2.9040190126976606) self.assertAlmostEqual(bst.upper_bound(), 3.3182142872462883) bst.save_model("model.txt") pred_from_matr = bst.predict(X_test) with tempfile.NamedTemporaryFile() as f: tname = f.name with open(tname, "w+b") as f: dump_svmlight_file(X_test, y_test, f) pred_from_file = bst.predict(tname) os.remove(tname) np.testing.assert_allclose(pred_from_matr, pred_from_file) # check saved model persistence bst = lgb.Booster(params, model_file="model.txt") os.remove("model.txt") pred_from_model_file = bst.predict(X_test) # we need to check the consistency of model file here, so test for exact equal np.testing.assert_array_equal(pred_from_matr, pred_from_model_file) # check early stopping is working. Make it stop very early, so the scores should be very close to zero pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5, "pred_early_stop_margin": 1.5} pred_early_stopping = bst.predict(X_test, **pred_parameter) # scores likely to be different, but prediction should still be the same np.testing.assert_array_equal(np.sign(pred_from_matr), np.sign(pred_early_stopping)) # test that shape is checked during prediction bad_X_test = X_test[:, 1:] bad_shape_error_msg = "The number of features in data*" np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, bad_X_test) np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, sparse.csr_matrix(bad_X_test)) np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, sparse.csc_matrix(bad_X_test)) with open(tname, "w+b") as f: dump_svmlight_file(bad_X_test, y_test, f) np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, tname) with open(tname, "w+b") as f: dump_svmlight_file(X_test, y_test, f, zero_based=False) np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, tname) os.remove(tname)
# test_len, ]).tocsr() print X.shape print X_t.shape skf = KFold(n_splits=5, shuffle=True, random_state=seed).split(X) for ind_tr, ind_te in skf: X_train = X[ind_tr] X_test = X[ind_te] y_train = y[ind_tr] y_test = y[ind_te] break dump_svmlight_file(X, y, inDir + "/input/X_tfidf.svm") del X dump_svmlight_file(X_t, np.zeros(X_t.shape[0]), inDir + "/input/X_t_tfidf.svm") del X_t def oversample(X_ot, y, p=0.165): pos_ot = X_ot[y == 1] neg_ot = X_ot[y == 0] #p = 0.165 scale = ((pos_ot.shape[0] * 1.0 / (pos_ot.shape[0] + neg_ot.shape[0])) / p) - 1 while scale > 1: neg_ot = ssp.vstack([neg_ot, neg_ot]).tocsr() scale -= 1 neg_ot = ssp.vstack([neg_ot,
imax = COL_LIMIT if COL_LIMIT < Y.shape[1] else Y.shape[1] for i in range(imax): i = 53 y = Y[:,i] took_indexes = (y != -1).nonzero()[0] print "Got indexes" if len(took_indexes) < 1000: continue y = y[took_indexes] x = X[took_indexes] x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=0) print "Dumping" datasets.dump_svmlight_file(x_train, y_train, "tmp/train%d" % i) datasets.dump_svmlight_file(x_test, y_test, "tmp/test%d" % i) os.system("cd tmp && csplit -s test%d 3 && mv xx01 test%d && rm xx00" % (i, i)) os.system("cd tmp && csplit -s train%d 3 && mv xx01 train%d && rm xx00" % (i, i)) # os.system("svm-train -g 2 -c 8 -q tmp/train%d tmp/model%d" % (i, i)) os.system("svm-train -g 0.03125 -c 32 tmp/train%d tmp/model%d" % (i, i)) os.system("svm-predict tmp/test%d tmp/model%d tmp/predicted%d" % (i, i, i)) y_predicted = np.array(map((lambda n: np.float64(n)), open("tmp/predicted%d" % i).read().split("\n")[0:-1])) deltas = np.subtract(y_predicted, y_test) rms = np.sqrt(np.mean(deltas**2)) print "Subject %s: RMS %f" % (y_labels[i], rms) # os.system("rm tmp/test%d tmp/train%d tmp/predicted%d" % (i, i, i)) DELTAS.append(deltas)
#pred_train = np.hstack(( pred_train, np.reshape(pred_label_train6,(-1,1)) )) #pred_test = np.hstack(( pred_test, np.reshape(pred_label_test6,(-1,1)) )) #vw print "Vowpal Wabbit" ss = StandardScaler() train1_data_norm = ss.fit_transform(train1_data) train2_data_norm = ss.transform(train2_data) test_data_norm = ss.transform(test_data) ntrain1_label = train_label1.copy() ntrain1_label.values[np.where(ntrain1_label == 0)] = -1 ntrain2_label = train_label2.copy() ntrain2_label.values[np.where(ntrain2_label == 0)] = -1 dump_svmlight_file(train1_data, ntrain1_label, "train1.vw", zero_based=False) dump_svmlight_file(train2_data, ntrain2_label, "train2.vw", zero_based=False) dump_svmlight_file(test_data, np.zeros((test_data_norm.shape[0], )), "test.vw", zero_based=False) print 1 of = open("vw_train1set.csv", "w") of2 = open("vw_train2set.csv", "w") of3 = open("vw_testset.csv", "w") fi = open("train1.vw", "r")
logger.info("complete train %s feature extraction" % column_list[column_list.index(key)]) print(column_list[column_list.index(key)] + " " + "trainset vocab shape: " + str(features_train.shape)) #data format logger.info("start data format %s" % column_list[column_list.index(key)]) train_features_save_path = "./classifier_3_train_features_svm_format_files/" + column_list[ column_list.index(key)] if not os.path.exists(train_features_save_path): os.makedirs(train_features_save_path) dump_svmlight_file(features_train, label_train, train_features_save_path + '/' + column_list[column_list.index(key)] + '.txt', zero_based=True, comment=None, query_id=None) logger.info("complete data format %s" % column_list[column_list.index(key)]) logger.info("complete all data format") logger.info("start train model") for column in column_list: model_save_path = "./model_files_classifier_3/" + column if not os.path.exists(model_save_path): os.makedirs(model_save_path) subprocess.call( "./thundersvm-master/build/bin/thundersvm-train -c 100 -g 0.5 " + "./classifier_3_train_features_svm_format_files/" + column + "/" +
def dump_svmlight_file(self, file): data = np.array(self.data) X = data[:, 0:2] y = data[:, 2] dump_svmlight_file(X, y, file)
from sklearn.datasets import dump_svmlight_file from sklearn.externals import joblib from sklearn.metrics import precision_score iris = datasets.load_iris() X = iris.data y = iris.target X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # use DMatrix for xgbosot dtrain = xgb.DMatrix(X_train, label=y_train) dtest = xgb.DMatrix(X_test, label=y_test) # use svmlight file for xgboost dump_svmlight_file(X_train, y_train, 'dtrain.svm', zero_based=True) dump_svmlight_file(X_test, y_test, 'dtest.svm', zero_based=True) dtrain_svm = xgb.DMatrix('dtrain.svm') dtest_svm = xgb.DMatrix('dtest.svm') # set xgboost params param = { 'max_depth': 3, # the maximum depth of each tree 'eta': 0.3, # the training step for each iteration 'silent': 1, # logging mode - quiet 'objective': 'multi:softprob', # error evaluation for multiclass training 'num_class': 3} # the number of classes that exist in this datset num_round = 20 # the number of training iterations #------------- numpy array ------------------ # training and testing - numpy matrices
datasets = ['webspam_u'] for dataset in datasets: if os.path.isfile('/home/neyo/PycharmProjects/AUC/datasets/%s' % (dataset)): print('Loading dataset = %s......' % (dataset), end=' ') X, y = load_svmlight_file( '/home/neyo/PycharmProjects/AUC/datasets/%s' % (dataset)) print('Done! Converting to binary......', end=' ') m = np.mean(y) INDEX = np.argwhere(y > m) index = np.argwhere(y <= m) y[INDEX] = 1 y[index] = -1 print('Done! Normalizing......', end=' ') X = preprocessing.normalize(X) print('Done! Dumping into file......', end=' ') dump_svmlight_file( X, y, '/home/neyo/PycharmProjects/AUC/bi-datasets/%s' % (dataset), zero_based=False) print('Done!') else: pass
n_classes = 2 split_params = {'test_size': 0.2, 'random_state': seed} X, y = datasets.make_classification(n_samples=n_samples, class_sep=0.4, n_features=n_features, n_classes=n_classes, random_state=seed) x_train, x_test, y_train, y_test = model_selection.train_test_split( X, y, **split_params) # Save data in .svm format train_path = os.path.abspath('x_train.svm') test_path = os.path.abspath('x_test.svm') datasets.dump_svmlight_file(x_train, y_train, train_path) datasets.dump_svmlight_file(x_test, y_test, test_path) clf = XGBClassifier(port=8085, nclasses=n_classes) booster_params = { "max_depth": 10, "subsample": 0.8, "eta": 0.3, "drop_rate": 0.4, "skip_drop": 0.4 } clf.fit([train_path, test_path], booster="dart", iterations=20, **booster_params)
x_test = [cv.preprocessor(x) for x in x_test] x_train_pre = x_train x_test_pre = x_test y_train = [float(y) for y in y_train] # cast to float so the labels can be used by the classifier y_test = [float(y) for y in y_test] union = Pipeline([( 'features', FeatureUnion(transformer_list=[( 'tfdif_features', Pipeline([('word', TfidfVectorizer(ngram_range=(1, 2))) #, ]))]))]) x_train = union.fit_transform(x_train) print('fold ' + str(index) + ', x_train.shape: ', x_train.shape) dump_svmlight_file( x_train, y_train, "dataset/representations/" + name_dataset + '/train' + str(index)) x_test = union.transform(x_test) print('fold ' + str(index) + ', x_test.shape: ', x_test.shape) dump_svmlight_file( x_test, y_test, "dataset/representations/" + name_dataset + '/test' + str(index)) print("Time End: %f" % (timeit.default_timer() - ini))
df, pd.DataFrame(array, columns=map(lambda x: 'ATTRS' + str(x), range(100))) ], axis=1) df.drop("ATTRS_STR", axis=1, inplace=True) del array # ---------- convert the category column to one-hot ---------- df = pd.concat([df, pd.get_dummies(df['CATID'], prefix='CATID')], axis=1) df.drop('CATID', axis=1, inplace=True) # ---------- drop the first three ID columns ---------- df.drop(['ID', 'ADVID', 'GOODSID'], axis=1, inplace=True) print df.head() # write out the file #with open('goods_vectors_newnew.csv', 'a') as f: # for row in df.values: # f.write(",".join(map(str, row)) + '\n') #sqldf = sqlContext.createDataFrame(df) #sqldf.save(path='/user/mjoys/goods_vectors_new', mode='overwrite') print df.info() print("Writing...") # convert the data to libsvm format from sklearn.datasets import dump_svmlight_file dump_svmlight_file(df.values, df.index.values, 'goods_vectors.libsvm') print("Done")
def tovw(x, y=None, sample_weight=None, convert_labels=False): """Convert array or sparse matrix to Vowpal Wabbit format Parameters ---------- x : {array-like, sparse matrix}, shape (n_samples, n_features) Training vector, where n_samples is the number of samples and n_features is the number of features. y : {array-like}, shape (n_samples,), optional Target vector relative to X. sample_weight : {array-like}, shape (n_samples,), optional sample weight vector relative to X. convert_labels : {bool} convert labels of the form [0,1] to [-1,1] Returns ------- out : {array-like}, shape (n_samples, 1) Training vectors in VW string format Examples -------- >>> import pandas as pd >>> from sklearn.feature_extraction.text import HashingVectorizer >>> from vowpalwabbit.sklearn_vw import tovw >>> X = pd.Series(['cat', 'dog', 'cat', 'cat'], name='catdog') >>> y = pd.Series([-1, 1, -1, -1], name='label') >>> hv = HashingVectorizer() >>> hashed = hv.fit_transform(X) >>> tovw(x=hashed, y=y) """ use_truth = y is not None use_weight = sample_weight is not None if use_truth: x, y = check_X_y(x, y, accept_sparse=True) else: x = check_array(x, accept_sparse=True) if use_weight: sample_weight = check_array(sample_weight, accept_sparse=False, ensure_2d=False, dtype=np.int, order="C") if sample_weight.ndim != 1: raise ValueError("Sample weights must be 1D array or scalar") if sample_weight.shape != (x.shape[0], ): raise ValueError("Sample weight shape == {}, expected {}".format( sample_weight.shape, (x.shape[0], ))) else: sample_weight = np.ones(x.shape[0], dtype=np.int) # convert labels of the form [0,1] to [-1,1] if convert_labels: y = np.where(y < 1, -1, 1) rows, cols = x.shape # check for invalid characters if array has string values if x.dtype.char == 'S': for row in rows: for col in cols: x[row, col] = INVALID_CHARS.sub('.', x[row, col]) # convert input to svmlight format s = io.BytesIO() dump_svmlight_file(x, np.zeros(rows), s) # parse entries to construct VW format rows = s.getvalue().decode('ascii').split('\n')[:-1] out = [] for idx, row in enumerate(rows): truth = y[idx] if use_truth else 1 weight = sample_weight[idx] features = row.split('0 ', 1)[1] # only using a single namespace and no tags out.append(('{y} {w} |{ns} {x}'.format(y=truth, w=weight, ns=DEFAULT_NS, x=features))) s.close() return out
import argparse from sklearn.datasets import load_svmlight_file, dump_svmlight_file import scipy.sparse as sp if __name__ == '__main__': parser = argparse.ArgumentParser(description='Concatenates datasets by features.') parser.add_argument('datasets', metavar='data', type=str, nargs='+', help='datasets which will be concatenated') parser.add_argument('-o', '--output', type=str, help='file to save the concatenated dataset') args = parser.parse_args() X_out = None for data in args.datasets: X, y = load_svmlight_file(data) X_out = X if X_out is None else sp.hstack((X_out, X)) if args.output is not None: dump_svmlight_file(X_out, y, args.output)
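# A hedged invocation sketch (script and file names are placeholders). Since
# the script hstacks feature matrices, it assumes every input file describes
# the same samples in the same order, and only the labels of the last file
# read are written to the output.
#
#     python concat_features.py part_a.svm part_b.svm -o combined.svm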
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer from sklearn.datasets import dump_svmlight_file import numpy as np import os y = [0,0,0,0] i = 0 vectorizer = CountVectorizer() X = vectorizer.fit_transform(pre_svm) transformer = TfidfTransformer() tfidf = transformer.fit_transform(X) f = open('/Users/arrowlittle/Desktop/data/wiki_libsvm.txt', 'w') dump_svmlight_file(tfidf, y, f, zero_based=False) f.close() print tfidf.toarray() #split data from pyspark.ml.classification import NaiveBayes from pyspark.ml.evaluation import MulticlassClassificationEvaluator data = spark.read.format("libsvm") \ .load("/Users/arrowlittle/Desktop/data/wiki_libsvm.txt") splits = data.randomSplit([0.9, 0.1], 1234) train = splits[0]
]).tocsr() print X.shape print X_t.shape skf = KFold(n_splits=5, shuffle=True, random_state=SEED).split(X) for ind_tr, ind_te in skf: X_train = X[ind_tr] X_test = X[ind_te] y_train = y[ind_tr] y_test = y[ind_te] break dump_svmlight_file(X,y,PATH+"X_tfidf.svm") del X dump_svmlight_file(X_t,np.zeros(X_t.shape[0]),PATH+"X_t_tfidf.svm") del X_t def oversample(X_ot,y,p=0.175): pos_ot = X_ot[y==1] neg_ot = X_ot[y==0] #p = 0.165 scale = ((pos_ot.shape[0]*1.0 / (pos_ot.shape[0] + neg_ot.shape[0])) / p) - 1 while scale > 1: neg_ot = ssp.vstack([neg_ot, neg_ot]).tocsr() scale -=1 neg_ot = ssp.vstack([neg_ot, neg_ot[:int(scale * neg_ot.shape[0])]]).tocsr() ot = ssp.vstack([pos_ot, neg_ot]).tocsr() y=np.zeros(ot.shape[0])
def run(self, x_train, y_train, x_test, y_test, x_validation_set=None, y_validation_set=None, meta=None): """Run factorization machine model against train and test data Parameters ---------- x_train : {array-like, matrix}, shape = [n_train, n_features] Training data y_train : numpy array of shape [n_train] Target values x_test: {array-like, matrix}, shape = [n_test, n_features] Testing data y_test : numpy array of shape [n_test] Testing target values x_validation_set: optional, {array-like, matrix}, shape = [n_train, n_features] Validation data (only for SGDA) y_validation_set: optional, numpy array of shape [n_train] Validation target data (only for SGDA) meta: optional, numpy array of shape [n_features] Grouping input variables Return ------- Returns `namedtuple` with the following properties: predictions: array [n_samples of x_test] Predicted target values per element in x_test. global_bias: float If k0 is True, returns the model's global bias w0 weights: array [n_features] If k1 is True, returns the model's weights for each features Wj pairwise_interactions: numpy matrix [n_features x k2] Matrix with pairwise interactions Vj,f rlog: pandas dataframe [nrow = num_iter] `pandas` DataFrame with measurements about each iteration """ from sklearn.datasets import dump_svmlight_file TMP_SUFFIX = '.pywfm' train_fd = tempfile.NamedTemporaryFile(suffix=TMP_SUFFIX, dir=self.__temp_path) test_fd = tempfile.NamedTemporaryFile(suffix=TMP_SUFFIX, dir=self.__temp_path) out_fd = tempfile.NamedTemporaryFile(suffix=TMP_SUFFIX, dir=self.__temp_path) model_fd = tempfile.NamedTemporaryFile(suffix=TMP_SUFFIX, dir=self.__temp_path) # converts train and test data to libSVM format dump_svmlight_file(x_train, y_train, train_fd) train_fd.seek(0) dump_svmlight_file(x_test, y_test, test_fd) test_fd.seek(0) # builds arguments array args = [ os.path.join(self.__libfm_path, "libFM"), '-task', "%s" % self.__task, '-train', "%s" % train_fd.name, '-test', "%s" % test_fd.name, '-dim', "'%s'" % self.__dim, '-init_stdev', "%g" % self.__init_stdev, '-iter', "%d" % self.__num_iter, '-method', "%s" % self.__learning_method, '-out', "%s" % out_fd.name, '-verbosity', "%d" % self.__verbose, '-save_model', "%s" % model_fd.name ] # appends rlog if true rlog_fd = None if self.__rlog: rlog_fd = tempfile.NamedTemporaryFile(suffix=TMP_SUFFIX, dir=self.__temp_path) args.extend(['-rlog', "%s" % rlog_fd.name]) # appends seed if given if self.__seed: args.extend(['-seed', "%d" % self.__seed]) # appends arguments that only work for certain learning methods if self.__learning_method in ['sgd', 'sgda']: args.extend(['-learn_rate', "%.5f" % self.__learn_rate]) if self.__learning_method in ['sgd', 'sgda', 'als']: args.extend(['-regular', "'%s'" % self.__regularization]) # adds validation if sgda # if validation_set is none, libFM will throw error hence, I'm not doing any validation validation_fd = None if self.__learning_method == 'sgda' and ( x_validation_set is not None and y_validation_set is not None): validation_fd = tempfile.NamedTemporaryFile(suffix=TMP_SUFFIX, dir=self.__temp_path) dump_svmlight_file(x_validation_set, y_validation_set, validation_fd.name) args.extend(['-validation', "%s" % validation_fd.name]) # if meta data is given meta_fd = None if meta is not None: meta_fd = tempfile.NamedTemporaryFile(suffix=TMP_SUFFIX, dir=self.__temp_path, text=True) # write group ids for group_id in meta: meta_fd.write("%s\n" % group_id) args.extend(['-meta', "%s" % meta_fd.name]) meta_fd.seek(0) # if silent redirects all output stdout = None if 
self.__silent: stdout = open(os.devnull, 'wb') # call libfm with parsed arguments # had unkown bug with "-dim" option on array. At the time was forced to # concatenate string `args = ' '.join(args)` but looks like its working # needs further tests subprocess.call(args, shell=False, stdout=stdout) # reads output file preds = [float(p) for p in out_fd.read().split('\n') if p] # "hidden" feature that allows users to save the model # We use this to get the feature weights # https://github.com/srendle/libfm/commit/19db0d1e36490290dadb530a56a5ae314b68da5d import numpy as np global_bias = None weights = [] pairwise_interactions = [] # if 0 its global bias; if 1, weights; if 2, pairwise interactions out_iter = 0 for line in model_fd.read().splitlines(): # checks which line is starting with # if line.startswith('#'): if "#global bias W0" in line: out_iter = 0 elif "#unary interactions Wj" in line: out_iter = 1 elif "#pairwise interactions Vj,f" in line: out_iter = 2 else: # check context get in previous step and adds accordingly if out_iter == 0: global_bias = float(line) elif out_iter == 1: weights.append(float(line)) elif out_iter == 2: try: pairwise_interactions.append( [float(x) for x in line.split(' ')]) except ValueError as e: pairwise_interactions.append( 0.0) #Case: no pairwise interactions used pairwise_interactions = np.matrix(pairwise_interactions) # parses rlog into dataframe if self.__rlog: # parses rlog into import pandas as pd rlog_fd.seek(0) print os.stat(rlog_fd.name).st_size rlog = pd.read_csv(rlog_fd.name, sep='\t') rlog_fd.close() else: rlog = None if self.__learning_method == 'sgda' and ( x_validation_set is not None and y_validation_set is not None): validation_fd.close() if meta is not None: meta_fd.close() # removes temporary output file after using train_fd.close() test_fd.close() model_fd.close() out_fd.close() # return as named collection for multiple output import collections fm = collections.namedtuple('model', [ 'predictions', 'global_bias', 'weights', 'pairwise_interactions', 'rlog' ]) return fm(preds, global_bias, weights, pairwise_interactions, rlog)
def test_feature_importances(self): data = np.random.randn(100, 5) target = np.array([0, 1] * 50) dump_svmlight_file(data, target, temp_name) xgb.encrypt_file(temp_name, temp_enc_name, sym_key_file) features = ['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5'] dm = xgb.DMatrix({username: temp_enc_name}, feature_names=features) params = {'objective': 'multi:softprob', 'eval_metric': 'mlogloss', 'eta': 0.3, 'num_class': 3} bst = xgb.train(params, dm, num_boost_round=10) # number of feature importances should == number of features scores1 = bst.get_score() scores2 = bst.get_score(importance_type='weight') scores3 = bst.get_score(importance_type='cover') scores4 = bst.get_score(importance_type='gain') scores5 = bst.get_score(importance_type='total_cover') scores6 = bst.get_score(importance_type='total_gain') assert len(scores1) == len(features) assert len(scores2) == len(features) assert len(scores3) == len(features) assert len(scores4) == len(features) assert len(scores5) == len(features) assert len(scores6) == len(features) # check backwards compatibility of get_fscore fscores = bst.get_fscore() assert scores1 == fscores dtrain = xgb.DMatrix({username: dpath + 'agaricus.txt.train.enc'}) dtest = xgb.DMatrix({username: dpath + 'agaricus.txt.test.enc'}) def fn(max_depth, num_rounds): # train params = {'max_depth': max_depth, 'eta': 1, 'verbosity': 0} bst = xgb.train(params, dtrain, num_boost_round=num_rounds) # predict preds = bst.predict(dtest)[0] contribs = bst.predict(dtest, pred_contribs=True)[0] # result should be (number of features + BIAS) * number of rows assert contribs.shape == (dtest.num_row(), dtest.num_col() + 1) # sum of contributions should be same as predictions np.testing.assert_array_almost_equal(np.sum(contribs, axis=1), preds) # for max_depth, num_rounds in itertools.product(range(0, 3), range(1, 5)): # yield fn, max_depth, num_rounds # check that we get the right SHAP values for a basic AND example # (https://arxiv.org/abs/1706.06060) X = np.zeros((4, 2)) X[0, :] = 1 X[1, 0] = 1 X[2, 1] = 1 y = np.zeros(4) y[0] = 1 param = {"max_depth": 2, "base_score": 0.0, "eta": 1.0, "lambda": 0} dump_svmlight_file(X, y, temp_name) xgb.encrypt_file(temp_name, temp_enc_name, sym_key_file) bst = xgb.train(param, xgb.DMatrix({username: temp_enc_name}), 1) dump_svmlight_file(X[0:1, :], np.zeros(1), temp_name) xgb.encrypt_file(temp_name, temp_enc_name, sym_key_file) out = bst.predict(xgb.DMatrix({username: temp_enc_name}), pred_contribs=True)[0] #TODO(rishabh): enable pred_contribs """ assert out[0, 0] == 0.375 assert out[0, 1] == 0.375 assert out[0, 2] == 0.25 """ def parse_model(model): trees = [] r_exp = r"([0-9]+):\[f([0-9]+)<([0-9\.e-]+)\] yes=([0-9]+),no=([0-9]+).*cover=([0-9e\.]+)" r_exp_leaf = r"([0-9]+):leaf=([0-9\.e-]+),cover=([0-9e\.]+)" for tree in model.get_dump(with_stats=True): lines = list(tree.splitlines()) trees.append([None for i in range(len(lines))]) for line in lines: match = re.search(r_exp, line) if match is not None: ind = int(match.group(1)) while ind >= len(trees[-1]): trees[-1].append(None) trees[-1][ind] = { "yes_ind": int(match.group(4)), "no_ind": int(match.group(5)), "value": None, "threshold": float(match.group(3)), "feature_index": int(match.group(2)), "cover": float(match.group(6)) } else: match = re.search(r_exp_leaf, line) ind = int(match.group(1)) while ind >= len(trees[-1]): trees[-1].append(None) trees[-1][ind] = { "value": float(match.group(2)), "cover": float(match.group(3)) } return trees def exp_value_rec(tree, z, x, i=0): if 
tree[i]["value"] is not None: return tree[i]["value"] else: ind = tree[i]["feature_index"] if z[ind] == 1: if x[ind] < tree[i]["threshold"]: return exp_value_rec(tree, z, x, tree[i]["yes_ind"]) else: return exp_value_rec(tree, z, x, tree[i]["no_ind"]) else: r_yes = tree[tree[i]["yes_ind"]]["cover"] / tree[i]["cover"] out = exp_value_rec(tree, z, x, tree[i]["yes_ind"]) val = out * r_yes r_no = tree[tree[i]["no_ind"]]["cover"] / tree[i]["cover"] out = exp_value_rec(tree, z, x, tree[i]["no_ind"]) val += out * r_no return val def exp_value(trees, z, x): return np.sum([exp_value_rec(tree, z, x) for tree in trees]) def all_subsets(ss): return itertools.chain(*map(lambda x: itertools.combinations(ss, x), range(0, len(ss) + 1))) def shap_value(trees, x, i, cond=None, cond_value=None): M = len(x) z = np.zeros(M) other_inds = list(set(range(M)) - set([i])) if cond is not None: other_inds = list(set(other_inds) - set([cond])) z[cond] = cond_value M -= 1 total = 0.0 for subset in all_subsets(other_inds): if len(subset) > 0: z[list(subset)] = 1 v1 = exp_value(trees, z, x) z[i] = 1 v2 = exp_value(trees, z, x) total += (v2 - v1) / (scipy.special.binom(M - 1, len(subset)) * M) z[i] = 0 z[list(subset)] = 0 return total def shap_values(trees, x): vals = [shap_value(trees, x, i) for i in range(len(x))] vals.append(exp_value(trees, np.zeros(len(x)), x)) return np.array(vals) def interaction_values(trees, x): M = len(x) out = np.zeros((M + 1, M + 1)) for i in range(len(x)): for j in range(len(x)): if i != j: out[i, j] = interaction_value(trees, x, i, j) / 2 svals = shap_values(trees, x) main_effects = svals - out.sum(1) out[np.diag_indices_from(out)] = main_effects return out def interaction_value(trees, x, i, j): M = len(x) z = np.zeros(M) other_inds = list(set(range(M)) - set([i, j])) total = 0.0 for subset in all_subsets(other_inds): if len(subset) > 0: z[list(subset)] = 1 v00 = exp_value(trees, z, x) z[i] = 1 v10 = exp_value(trees, z, x) z[j] = 1 v11 = exp_value(trees, z, x) z[i] = 0 v01 = exp_value(trees, z, x) z[j] = 0 total += (v11 - v01 - v10 + v00) / (scipy.special.binom(M - 2, len(subset)) * (M - 1)) z[list(subset)] = 0 return total # test a simple and function M = 2 N = 4 X = np.zeros((N, M)) X[0, :] = 1 X[1, 0] = 1 X[2, 1] = 1 y = np.zeros(N) y[0] = 1 param = {"max_depth": 2, "base_score": 0.0, "eta": 1.0, "lambda": 0} #TODO(rishabh): enable pred_contribs """
def combine_feat(feat_names, feat_path_name):
    print("==================================================")
    print("Combine features...")

    ######################
    ## Cross-validation ##
    ######################
    print("For cross-validation...")
    ## for each run and fold
    for run in range(1, config.n_runs + 1):
        ## use 33% for training and 67% for validation
        ## so we switch trainInd and validInd
        for fold in range(1, config.n_folds + 1):
            print("Run: %d, Fold: %d" % (run, fold))
            path = "%s/Run%d/Fold%d" % (config.feat_folder, run, fold)
            save_path = "%s/%s/Run%d/Fold%d" % (config.feat_folder, feat_path_name, run, fold)
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            for i, (feat_name, transformer) in enumerate(feat_names):
                ## load train feat
                feat_train_file = "%s/train.%s.feat.pkl" % (path, feat_name)
                with open(feat_train_file, "rb") as f:
                    x_train = cPickle.load(f)
                if len(x_train.shape) == 1:
                    x_train.shape = (x_train.shape[0], 1)

                ## load valid feat
                feat_valid_file = "%s/valid.%s.feat.pkl" % (path, feat_name)
                with open(feat_valid_file, "rb") as f:
                    x_valid = cPickle.load(f)
                if len(x_valid.shape) == 1:
                    x_valid.shape = (x_valid.shape[0], 1)

                ## align feat dim
                dim_diff = abs(x_train.shape[1] - x_valid.shape[1])
                if x_valid.shape[1] < x_train.shape[1]:
                    x_valid = hstack(
                        [x_valid, np.zeros((x_valid.shape[0], dim_diff))]).tocsr()
                elif x_valid.shape[1] > x_train.shape[1]:
                    x_train = hstack(
                        [x_train, np.zeros((x_train.shape[0], dim_diff))]).tocsr()

                ## apply transformation
                x_train = transformer.fit_transform(x_train)
                x_valid = transformer.transform(x_valid)

                ## stack feat
                if i == 0:
                    X_train, X_valid = x_train, x_valid
                else:
                    try:
                        X_train, X_valid = hstack([X_train, x_train]), hstack([X_valid, x_valid])
                    except:
                        X_train, X_valid = np.hstack([X_train, x_train]), np.hstack([X_valid, x_valid])
                print("Combine {:>2}/{:>2} feat: {} ({}D)".format(
                    i + 1, len(feat_names), feat_name, x_train.shape[1]))
            print("Feat dim: {}D".format(X_train.shape[1]))

            ## load label
            # train
            info_train = pd.read_csv("%s/train.info" % (save_path))
            ## change it to zero-based for multi-classification in xgboost
            Y_train = info_train["median_relevance"] - 1
            # valid
            info_valid = pd.read_csv("%s/valid.info" % (save_path))
            Y_valid = info_valid["median_relevance"] - 1

            ## dump feat
            dump_svmlight_file(X_train, Y_train, "%s/train.feat" % (save_path))
            dump_svmlight_file(X_valid, Y_valid, "%s/valid.feat" % (save_path))

    ##########################
    ## Training and Testing ##
    ##########################
    print("For training and testing...")
    path = "%s/All" % (config.feat_folder)
    save_path = "%s/%s/All" % (config.feat_folder, feat_path_name)
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    for i, (feat_name, transformer) in enumerate(feat_names):
        ## load train feat
        feat_train_file = "%s/train.%s.feat.pkl" % (path, feat_name)
        with open(feat_train_file, "rb") as f:
            x_train = cPickle.load(f)
        if len(x_train.shape) == 1:
            x_train.shape = (x_train.shape[0], 1)

        ## load test feat
        feat_test_file = "%s/test.%s.feat.pkl" % (path, feat_name)
        with open(feat_test_file, "rb") as f:
            x_test = cPickle.load(f)
        if len(x_test.shape) == 1:
            x_test.shape = (x_test.shape[0], 1)

        ## align feat dim
        dim_diff = abs(x_train.shape[1] - x_test.shape[1])
        if x_test.shape[1] < x_train.shape[1]:
            x_test = hstack([x_test, np.zeros((x_test.shape[0], dim_diff))]).tocsr()
        elif x_test.shape[1] > x_train.shape[1]:
            x_train = hstack([x_train, np.zeros((x_train.shape[0], dim_diff))]).tocsr()

        ## apply transformation
        x_train = transformer.fit_transform(x_train)
        x_test = transformer.transform(x_test)

        ## stack feat
        if i == 0:
            X_train, X_test = x_train, x_test
        else:
            try:
                X_train, X_test = hstack([X_train, x_train]), hstack([X_test, x_test])
            except:
                X_train, X_test = np.hstack([X_train, x_train]), np.hstack([X_test, x_test])
        print("Combine {:>2}/{:>2} feat: {} ({}D)".format(
            i + 1, len(feat_names), feat_name, x_train.shape[1]))
    print("Feat dim: {}D".format(X_train.shape[1]))

    ## load label
    # train
    info_train = pd.read_csv("%s/train.info" % (save_path))
    ## change it to zero-based for multi-classification in xgboost
    Y_train = info_train["median_relevance"] - 1
    # test
    info_test = pd.read_csv("%s/test.info" % (save_path))
    Y_test = info_test["median_relevance"] - 1

    ## dump feat
    dump_svmlight_file(X_train, Y_train, "%s/train.feat" % (save_path))
    dump_svmlight_file(X_test, Y_test, "%s/test.feat" % (save_path))
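As a hedged aside (not part of the pipeline above), the dumped *.feat files can be read back with load_svmlight_file; passing n_features from the training matrix keeps the validation matrix column-aligned even when the validation split misses the highest feature index. The paths below are purely illustrative, since the real locations depend on config.feat_folder.

from sklearn.datasets import load_svmlight_file

# illustrative paths; the actual ones are built from config.feat_folder above
X_train, Y_train = load_svmlight_file("Feat/Combined/Run1/Fold1/train.feat")
X_valid, Y_valid = load_svmlight_file("Feat/Combined/Run1/Fold1/valid.feat",
                                      n_features=X_train.shape[1])
assert X_train.shape[1] == X_valid.shape[1]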
def test_dump():
    X_sparse, y_dense = load_svmlight_file(datafile)
    X_dense = X_sparse.toarray()
    y_sparse = sp.csr_matrix(y_dense)

    # slicing a csr_matrix can unsort its .indices, so test that we sort
    # those correctly
    X_sliced = X_sparse[np.arange(X_sparse.shape[0])]
    y_sliced = y_sparse[np.arange(y_sparse.shape[0])]

    for X in (X_sparse, X_dense, X_sliced):
        for y in (y_sparse, y_dense, y_sliced):
            for zero_based in (True, False):
                for dtype in [np.float32, np.float64, np.int32, np.int64]:
                    f = BytesIO()
                    # we need to pass a comment to get the version info in;
                    # LibSVM doesn't grok comments so they're not put in by
                    # default anymore.

                    if (sp.issparse(y) and y.shape[0] == 1):
                        # make sure y's shape is: (n_samples, n_labels)
                        # when it is sparse
                        y = y.T

                    # Note: with dtype=np.int32 we are performing unsafe casts,
                    # where X.astype(dtype) overflows. The result is
                    # then platform dependent and X_dense.astype(dtype) may be
                    # different from X_sparse.astype(dtype).asarray().
                    X_input = X.astype(dtype)

                    dump_svmlight_file(X_input, y, f, comment="test",
                                       zero_based=zero_based)
                    f.seek(0)

                    comment = f.readline()
                    comment = str(comment, "utf-8")
                    assert "scikit-learn %s" % sklearn.__version__ in comment

                    comment = f.readline()
                    comment = str(comment, "utf-8")
                    assert ["one", "zero"][zero_based] + "-based" in comment

                    X2, y2 = load_svmlight_file(f, dtype=dtype,
                                                zero_based=zero_based)
                    assert X2.dtype == dtype
                    assert_array_equal(X2.sorted_indices().indices, X2.indices)

                    X2_dense = X2.toarray()
                    if sp.issparse(X_input):
                        X_input_dense = X_input.toarray()
                    else:
                        X_input_dense = X_input

                    if dtype == np.float32:
                        # allow a rounding error at the last decimal place
                        assert_array_almost_equal(
                            X_input_dense, X2_dense, 4)
                        assert_array_almost_equal(
                            y_dense.astype(dtype, copy=False), y2, 4)
                    else:
                        # allow a rounding error at the last decimal place
                        assert_array_almost_equal(
                            X_input_dense, X2_dense, 15)
                        assert_array_almost_equal(
                            y_dense.astype(dtype, copy=False), y2, 15)
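For reference, a compact round trip through an in-memory buffer that mirrors the behaviour the test above exercises; the matrix values are arbitrary and chosen only for illustration.

import numpy as np
import scipy.sparse as sp
from io import BytesIO
from sklearn.datasets import dump_svmlight_file, load_svmlight_file

X = sp.csr_matrix(np.array([[0.0, 2.5], [1.0, 0.0]]))
y = np.array([1, 0])

buf = BytesIO()
# the comment triggers the header lines that the test checks for
dump_svmlight_file(X, y, buf, comment="round-trip example", zero_based=True)
buf.seek(0)

X2, y2 = load_svmlight_file(buf, zero_based=True)
np.testing.assert_array_almost_equal(X.toarray(), X2.toarray())
np.testing.assert_array_equal(y, y2)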
from sklearn.datasets import make_regression
from sklearn.datasets import dump_svmlight_file
import numpy as np

X, y = make_regression(**{
    'n_samples': 1000000,
    'n_features': 50,
    'n_informative': 4,
    'n_targets': 1,
    'random_state': 37
})

output = 'regression.binary'
# data = dump_svmlight_file(X, y, output)
data = dump_svmlight_file(X, y, output, zero_based=False)
# dump_svmlight_file writes directly to `output` and returns None,
# so this prints None; the data itself lives in the file.
print(data)
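As a quick, illustrative sanity check (assuming the dump above has completed), the written file should contain one-based feature indices because zero_based=False was passed.

# read the first data line; with zero_based=False the first feature index is >= 1
with open('regression.binary') as fh:
    label, first_feature = fh.readline().split()[:2]
assert int(first_feature.split(':')[0]) >= 1
print(label, first_feature)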
def create_svm_file(df, features_X, path):
    """
    Convert a pandas DF into lib-SVM format!

    Args:
        - df (pd DF)        : Pandas DF - needs to have the columns that are
                              passed in the features_X argument!
        - features_X (list) : list of strings with the names of the attributes
                              in df we want to use as features!
        - path (string)     : path (incl. DF_Name) to save the lib-SVM at!
                              [on top of "data/processed/Ranking/tf_ranking/"]
                              e.g. "CV/1.txt"

    Return:
        - save the lib-SVM DF
        - return the true responses of the passed df
          (used to calc the F1-Score later on!)
    """
    # [1] Check Input:
    # - all feature names in df, so we can select the corresponding cols?
    for _feat in features_X:
        if _feat not in df.columns.values:
            raise ValueError(_feat + " is not a column name in df")

    # - features_X must start with "transport_mode"
    #   (needed to create valid predictions)
    if features_X[0] != "transport_mode":
        raise ValueError("'features_X' must start with 'transport_mode'")

    # - does the target folder already exist?
    #   split folder and file name and check for the existence of the folder
    folder = path.split("/")
    folder = folder[:(len(folder) - 1)]
    folder = "/".join(folder)
    if not os.path.isdir("data/processed/Ranking/tf_ranking/" + str(folder)):
        raise ValueError(
            str(folder) + " does not exist in 'data/processed/Ranking/tf_ranking/'")

    # [2] Clean the DF and get it ready!
    # - Sort the SIDs
    df.sort_values("sid", inplace=True)
    # - drop rows that have the same trans_mode multiple times for a single sid!
    df = df.drop_duplicates(['sid', 'transport_mode'], keep='first')

    # [3] Create ranking target
    # - if click_mode is present, we mark the target with "1" and the irrelevant ones with "0"
    if 'click_mode' in df.columns:
        print("Build LTR labels")
        # 1 for target <--> 0 else
        df = df.assign(target=df.apply(
            lambda x: 1 if x.click_mode == x.transport_mode else 0, axis=1))
    else:
        # if test set, every entry gets zero as a label
        print("Assign label 0 for test set")
        df = df.assign(target=0)

    # [4] Split the DF into target & features + extract the SIDs
    # - we pass these to the svm-converter to create a libsvm DF!
    X = df[features_X]
    y = df["target"]
    query_id = df.sid

    path = "data/processed/Ranking/tf_ranking/" + str(path)

    # [5] Save the SVM file on top of: "data/processed/Ranking/tf_ranking"
    print("Dump file")
    dump_svmlight_file(X=X, y=y, f=path, query_id=query_id, zero_based=False)

    # [6] Return the values of the true click_modes [needed for metrics!]
    return np.array(df.drop_duplicates(["sid"]).click_mode)
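A hypothetical usage sketch of the function above: the toy DataFrame, column values, and output path are made up for illustration, and the directory data/processed/Ranking/tf_ranking/CV/ is assumed to exist already.

import pandas as pd

toy = pd.DataFrame({
    "sid":            [1, 1, 2, 2],
    "transport_mode": [1, 2, 1, 3],
    "click_mode":     [2, 2, 3, 3],
    "distance":       [120.0, 80.0, 300.0, 150.0],
})

# writes data/processed/Ranking/tf_ranking/CV/toy.txt in libsvm format with
# query_id taken from sid, and returns the true click_mode per sid
true_clicks = create_svm_file(toy,
                              features_X=["transport_mode", "distance"],
                              path="CV/toy.txt")
print(true_clicks)  # expected: array([2, 3])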