Example #1
def subsample_to_file(svm_file, out_dir, out_name, multilabel=False,
                      row_ratio=0.5, col_ratio=0.3, random_state=12):
  """
  Example:

  '''python
     # Running the following in the current directory creates a `tmp`
     # folder (if it does not already exist) and generates a file called
     # `a9a_sub` from the original file `./data/a9a`. Both files are
     # in libsvm format.
     subsample_to_file("./data/a9a", "./tmp", "a9a_sub")
     # Read the subsampled file and check that its number of rows is half
     # that of a9a and that its number of columns is roughly a third of
     # a9a's (123).
     X, y = load_svmlight_file('./tmp/a9a_sub')
     assert X.shape == (16280, 36)
  '''

  """
  assert 1 >= row_ratio > 0, \
         "Row ratio {row_ratio} must be (0, 1]" \
         .format(**locals())
  assert 1 >= col_ratio > 0, \
         "Col ratio {col_ratio} must be (0, 1]" \
         .format(**locals())
  X, y = load_svmlight_file(svm_file, multilabel=multilabel)
  n, m = X.shape
  subn = int(n*row_ratio)
  subm = int(m*col_ratio)
  rst = np.random.RandomState(random_state)
  ridx = rst.choice(n, subn, replace=False)
  cidx = rst.choice(m, subm, replace=False)
  mkdir_p(out_dir)
  out_file = os.path.join(out_dir, out_name)
  dump_svmlight_file(X[ridx,:][:,cidx], y[ridx],
                     out_file, multilabel=multilabel)
Example #2
def save_libfm(X_sprs_mat, y_array, f):
    
    print("Save LibFM Format")
    
    dump_svmlight_file(X_sprs_mat, y_array, f)
    
    return
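libFM reads the same sparse text format as libsvm/svmlight, so the helper above only needs dump_svmlight_file. A minimal usage sketch, assuming the save_libfm helper above is importable; the toy matrix, labels, and output path are illustrative only:

import numpy as np
from scipy.sparse import csr_matrix

# Toy feature matrix and targets (illustrative values only)
X = csr_matrix(np.array([[1.0, 0.0, 2.0],
                         [0.0, 3.0, 0.0]]))
y = np.array([5.0, 3.0])

# Writes libFM/svmlight-style lines such as "5 0:1 2:2"
save_libfm(X, y, "ratings.libfm")
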
def test_load_with_offsets(sparsity, n_samples, n_features):
    rng = np.random.RandomState(0)
    X = rng.uniform(low=0.0, high=1.0, size=(n_samples, n_features))
    if sparsity:
        X[X < sparsity] = 0.0
    X = sp.csr_matrix(X)
    y = rng.randint(low=0, high=2, size=n_samples)

    f = BytesIO()
    dump_svmlight_file(X, y, f)
    f.seek(0)

    size = len(f.getvalue())

    # put some marks that are likely to happen anywhere in a row
    mark_0 = 0
    mark_1 = size // 3
    length_0 = mark_1 - mark_0
    mark_2 = 4 * size // 5
    length_1 = mark_2 - mark_1

    # load the original sparse matrix into 3 independent CSR matrices
    X_0, y_0 = load_svmlight_file(f, n_features=n_features,
                                  offset=mark_0, length=length_0)
    X_1, y_1 = load_svmlight_file(f, n_features=n_features,
                                  offset=mark_1, length=length_1)
    X_2, y_2 = load_svmlight_file(f, n_features=n_features,
                                  offset=mark_2)

    y_concat = np.concatenate([y_0, y_1, y_2])
    X_concat = sp.vstack([X_0, X_1, X_2])
    assert_array_almost_equal(y, y_concat)
    assert_array_almost_equal(X.toarray(), X_concat.toarray())
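Outside the test above, the same offset/length mechanism lets a large svmlight file be read in independent chunks. A minimal sketch, assuming a file data.svm already exists and its feature count is known (both names are illustrative):

import os
from sklearn.datasets import load_svmlight_file

path = "data.svm"      # hypothetical file in svmlight format
n_features = 123       # must be fixed so all chunks share one column space
size = os.path.getsize(path)
half = size // 2

# Each call skips forward to the first complete line at or after its offset,
# so the two chunks together cover every row exactly once.
X_a, y_a = load_svmlight_file(path, n_features=n_features, offset=0, length=half)
X_b, y_b = load_svmlight_file(path, n_features=n_features, offset=half)
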
def test_dump():
    Xs, y = load_svmlight_file(datafile)
    Xd = Xs.toarray()

    for X in (Xs, Xd):
        for zero_based in (True, False):
            for dtype in [np.float32, np.float64]:
                f = BytesIO()
                dump_svmlight_file(X.astype(dtype), y, f, zero_based=zero_based)
                f.seek(0)

                comment = f.readline()
                assert_in("scikit-learn %s" % sklearn.__version__, comment)
                comment = f.readline()
                assert_in(["one", "zero"][zero_based] + "-based", comment)

                X2, y2 = load_svmlight_file(f, dtype=dtype, zero_based=zero_based)
                assert_equal(X2.dtype, dtype)
                if dtype == np.float32:
                    assert_array_almost_equal(
                        # allow a rounding error at the last decimal place
                        Xd.astype(dtype),
                        X2.toarray(),
                        4,
                    )
                else:
                    assert_array_almost_equal(
                        # allow a rounding error at the last decimal place
                        Xd.astype(dtype),
                        X2.toarray(),
                        15,
                    )
                assert_array_equal(y, y2)
    def batch_fit(self, Xs, ys, dump=True):
        qids = [np.array([i] * len(ys[i])) for i in range(len(ys))]
        print "dumping data to Xtrain.data"
        if dump:
            dump_svmlight_file(
                np.concatenate(Xs), np.concatenate(ys), "Xtrain.data", zero_based=False, query_id=np.concatenate(qids)
            )

        print "now learning"

        print call(
            [
                self.path + "svm_hmm_learn",
                "-c",
                "%d" % self.C,
                "--t",
                "%d" % self.t,
                "--e",
                "%d" % self.e,
                "Xtrain.data",
                "svmhmm-model.dat",
            ]
        )

        return self
Example #6
    def fit(self, X, Y):

        self.labels=list(set(Y))
        if len(self.labels) > 2 :
            self.multiclass=True
            #print 'multiclass'
        else:
            self.multiclass=False
            
        self.train_fname =self.base_str +'-svmcmd-train' +  '.dat'
        self.model_fname =self.train_fname + '.model'
        dump_svmlight_file(X,Y,self.train_fname ,zero_based=False)
        if self.multiclass:
            command_line=path_to_train_program+'gtsvm_initialize {0} -f {1} -o {2}  -m 1 '.format(self.param_str, self.train_fname , self.model_fname )
        else:
            command_line=path_to_train_program+'gtsvm_initialize -f {1} -o {2} {0}'.format(self.param_str, self.train_fname , self.model_fname )
        args = shlex.split(command_line)
        p = subprocess.Popen(args)
        p.wait()
        command_line=path_to_train_program+'gtsvm_optimize -i {0} -o {1} -e {2} -n {3}'.format(self.model_fname,self.model_fname,self.tol,self.max_iter)    
        args = shlex.split(command_line)
        p = subprocess.Popen(args,stderr=subprocess.PIPE)
        p.wait()
        opt_err_str=p.stderr.read() ##gtsvm is too buggy
        if len(opt_err_str) < 1: 
            command_line=path_to_train_program+'gtsvm_shrink -i {0}  -o {1}'.format(self.model_fname,self.model_fname)
            args = shlex.split(command_line)
            p = subprocess.Popen(args)
            p.wait()
            self.train_fail=False
        else :
            self.train_fail=True
            
        
        return self
Example #7
 def predict(self, X):
     if isinstance(X,list):
         self.test_n_sample=len(X)
     else:
         self.test_n_sample=X.shape[0]
     Y=[1]*self.test_n_sample
     self.test_fname =self.base_str +'-svmcmd-test' +  '.dat'
     self.predict_fname =self.base_str +'-svmcmd-predict' +  '.dat'
     dump_svmlight_file(X,Y,self.test_fname ,zero_based=False)
     command_line=path_to_train_program+'gtsvm_classify -f {0}  -i {1} -o {2}'.format(self.test_fname , self.model_fname, self.predict_fname )
     args = shlex.split(command_line)
     p = subprocess.Popen(args)
     p.wait()
     if self.train_fail:
         return [max(self.labels)+1]*self.test_n_sample
     
     if self.multiclass : 
         f = open(self.predict_fname, 'rb')
         self.predicted_weight = map(lambda row: map(float,row), list(csv.reader(f)))
         f.close()
         Y_predict=map(np.argmax, self.predicted_weight)
     else :
         self.predicted_weight = np.loadtxt( self.predict_fname)
         Y_predict=map(int,map(round,self.predicted_weight))
     return Y_predict
Example #8
def generate_weekday_newbuyer_exposure(df):
    """
    Add new-buyer count and exposure count features
    """
    X = df[['uv_0612_0618', 'uv_weekday', 'uv_weekend', 'no_subsidy_exposure', 'newbuyer_6_18']]
    y = df.uv_0626_0702
    dump_svmlight_file(X, y, './uv_weekday_weekend_newbuyer_exposure_without_outliers.dat')
Example #9
def generate_week(df):
    """
    Generate the 1-dimensional feature
    """
    X = df[['uv_0612_0618']]
    y = df.uv_0626_0702
    dump_svmlight_file(X, y, './uv_week.dat')
Example #10
def executa_extracao_n(base_treino, metodo, n=1):
    inicio = time()    
    
    lista_imagens = arq.busca_arquivos(base_treino, "*.png")
    n_imgs_treino = len(lista_imagens)
    
    for lado in range(8,n+1,4):
        atributos = []    
        rotulos = []     
            
        arq_treino = base_treino + "base_PFTAS_"+str(lado)+"x"+str(lado)+".svm"
        ##  START OF THE ATTRIBUTE EXTRACTION PROCESS
        
        for arq_imagem in lista_imagens: 
            print("Arquivo: " + arq_imagem)
            imagem = mh.imread(arq_imagem) 
            if (imagem != None):
                classe, _ = ex.classe_arquivo(arq_imagem)             
                print("executa_extracao_n - shape imagem:" + str(imagem.shape))
                # Extrai os atributos e gera os arquivos dos patches da base de treino
                atrs,rots = extrai_pftas_patches_n(imagem, classe, lado)                            
                atributos += atrs
                rotulos += rots
        
        dump_svmlight_file(atributos, rotulos, arq_treino)
    
    log("Extraidos atributos da base " + base_treino + " utilizando " + metodo + "\n para " + str(n_imgs_treino) + "imagens") 
  
    # Exibe o tempo de execução    
    log(str(time()-inicio) + "EXTRAÇÃO")     
Example #11
def generate_weekday_weekend(df):
    """
    Generate the 3-dimensional features
    """
    X = df[['uv_0612_0618', 'uv_weekday', 'uv_weekend']]
    y = df.uv_0626_0702
    dump_svmlight_file(X, y, './uv_weekday_weekend.dat')
Example #12
def test_dump():
    Xs, y = load_svmlight_file(datafile)
    Xd = Xs.toarray()

    for X in (Xs, Xd):
        for zero_based in (True, False):
            for dtype in [np.float32, np.float64]:
                f = BytesIO()
                # we need to pass a comment to get the version info in;
                # LibSVM doesn't grok comments so they're not put in by
                # default anymore.
                dump_svmlight_file(X.astype(dtype), y, f, comment="test",
                                   zero_based=zero_based)
                f.seek(0)

                comment = f.readline()
                assert_in("scikit-learn %s" % sklearn.__version__, comment)
                comment = f.readline()
                assert_in(["one", "zero"][zero_based] + "-based", comment)

                X2, y2 = load_svmlight_file(f, dtype=dtype,
                                            zero_based=zero_based)
                assert_equal(X2.dtype, dtype)
                if dtype == np.float32:
                    assert_array_almost_equal(
                        # allow a rounding error at the last decimal place
                        Xd.astype(dtype), X2.toarray(), 4)
                else:
                    assert_array_almost_equal(
                        # allow a rounding error at the last decimal place
                        Xd.astype(dtype), X2.toarray(), 15)
                assert_array_equal(y, y2)
Example #13
 def save_all_data_in_svmlight_format(self,
                                      file_path,
                                      extraction_method,
                                      label_type):
     label_list, feature_vector_list = self.extract_all_data(extraction_method, label_type)
     with open(file_path, 'wb') as f:
         datasets.dump_svmlight_file(feature_vector_list, label_list, f)
Example #14
def dump_svmlight(X_matrix, Y, feature_names, output_filename, feature_id_offset = 0):
  dump_svmlight_file(X_matrix, Y, output_filename)
  contents = None
  with open(output_filename) as output_file:
    contents = '#' + ' '.join(feature_names) + '\n' + ''.join(output_file.readlines())
  with open(output_filename, 'w') as output_file:
    output_file.write(contents)
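For a simple header like the one written above, dump_svmlight_file's own comment argument gives a similar result in a single pass. An alternative sketch (note that scikit-learn also prepends its version string and the column-index convention to the comment block):

from sklearn.datasets import dump_svmlight_file

def dump_svmlight_with_names(X_matrix, Y, feature_names, output_filename):
  # scikit-learn writes the comment as leading "#"-prefixed lines
  dump_svmlight_file(X_matrix, Y, output_filename,
                     comment=' '.join(feature_names))
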
Example #15
    def data_dump(self, f, X_train, X_test, y_train, y_test):
        from sklearn.datasets import dump_svmlight_file
        ddd = dict()
        new_y_train = []
        last = 0
        for yy in y_train:
            if yy in ddd:
                yy = (ddd[yy])
            else:
                ddd[yy] = last
                yy = last
                last += 1
            new_y_train.append(yy)

        dump_svmlight_file(X_train, new_y_train, f + ".svmlight.train")
        
        new_y_test = []
        for yy in y_test:
            if yy in ddd:
                yy = (ddd[yy])
            else:
                ddd[yy] = last
                yy = last
                last += 1
            new_y_test.append(yy)
        
        dump_svmlight_file(X_test, new_y_test, f + ".svmlight.test")
def load_training_data(file_location=str, load_from_database=False, limit=int(1000), clean_dataset=True):
    """
    If ```load_from_database``` is True, retrieves and stores data from database to file.

    Arguments:
        file_location (str): Path + filename of libsvm file to save/load (e.g. 'training_data')
        load_from_database (bool): Should data be retrieved from database?
        limit (int): Amount of records to retrieve from database (default=1000)
        clean_dataset (bool): Should questions be cleaned (e.g. remove code samples, hexadecimals, numbers, etc)?

    Returns:
         (pandas.DataFrame.from_csv, sklearn.datasets.load_svmlight_file):
         Tuple containing a pandas.DataFrame (all data retrieved from database) and
         tuple with training data (load_svmlight_file)

    See:
        | ```MySQLDatabase().retrieve_training_data```
        | ```pandas.DataFrame.to_csv```
        | ```pandas.DataFrame.from_csv```
        | ```sklearn.datasets.dump_svmlight_file```
        | ```sklearn.datasets.load_svmlight_file```
    """
    svm_file = file_location + ".dat"
    csv_file = file_location + ".csv"
    if load_from_database:
        comment = u"label: (-1: Bad question, +1: Good question); features: (term_id, frequency)"
        MySQLDatabase().set_vote_value_params()
        data = MySQLDatabase().retrieve_training_data(limit, clean_dataset)
        # create a term-document matrix
        vectorizer = CountVectorizer(analyzer='word', min_df=0.01, stop_words="english")
        td_matrix = vectorizer.fit_transform(data.get(QUESTION_TEXT_KEY))
        data.to_csv(csv_file)
        dump_svmlight_file(td_matrix, data[CLASS_LABEL_KEY], f=svm_file, comment=comment)
    return DataFrame.from_csv(csv_file), load_svmlight_file(svm_file)
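A hypothetical call of the function above, assuming MySQLDatabase is already configured; the path prefix is illustrative:

df, (X_train, y_train) = load_training_data("training_data",
                                             load_from_database=True,
                                             limit=500)
print(X_train.shape, y_train.shape)
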
def test_dump_comment():
    X, y = load_svmlight_file(datafile)
    X = X.toarray()

    f = BytesIO()
    ascii_comment = "This is a comment\nspanning multiple lines."
    dump_svmlight_file(X, y, f, comment=ascii_comment, zero_based=False)
    f.seek(0)

    X2, y2 = load_svmlight_file(f, zero_based=False)
    assert_array_almost_equal(X, X2.toarray())
    assert_array_equal(y, y2)

    # XXX we have to update this to support Python 3.x
    utf8_comment = "It is true that\n\xc2\xbd\xc2\xb2 = \xc2\xbc"
    f = BytesIO()
    assert_raises(UnicodeDecodeError, dump_svmlight_file, X, y, f, comment=utf8_comment)

    unicode_comment = utf8_comment.decode("utf-8")
    f = BytesIO()
    dump_svmlight_file(X, y, f, comment=unicode_comment, zero_based=False)
    f.seek(0)

    X2, y2 = load_svmlight_file(f, zero_based=False)
    assert_array_almost_equal(X, X2.toarray())
    assert_array_equal(y, y2)

    f = BytesIO()
    assert_raises(ValueError, dump_svmlight_file, X, y, f, comment="I've got a \0.")
def test_load_with_long_qid():
    # load svmfile with longint qid attribute
    data = b("""
    1 qid:0 0:1 1:2 2:3
    0 qid:72048431380967004 0:1440446648 1:72048431380967004 2:236784985
    0 qid:-9223372036854775807 0:1440446648 1:72048431380967004 2:236784985
    3 qid:9223372036854775807  0:1440446648 1:72048431380967004 2:236784985""")
    X, y, qid = load_svmlight_file(BytesIO(data), query_id=True)

    true_X = [[1,          2,                 3],
             [1440446648, 72048431380967004, 236784985],
             [1440446648, 72048431380967004, 236784985],
             [1440446648, 72048431380967004, 236784985]]

    true_y = [1, 0, 0, 3]
    trueQID = [0, 72048431380967004, -9223372036854775807, 9223372036854775807]
    assert_array_equal(y, true_y)
    assert_array_equal(X.toarray(), true_X)
    assert_array_equal(qid, trueQID)

    f = BytesIO()
    dump_svmlight_file(X, y, f, query_id=qid, zero_based=True)
    f.seek(0)
    X, y, qid = load_svmlight_file(f, query_id=True, zero_based=True)
    assert_array_equal(y, true_y)
    assert_array_equal(X.toarray(), true_X)
    assert_array_equal(qid, trueQID)

    f.seek(0)
    X, y = load_svmlight_file(f, query_id=False, zero_based=True)
    assert_array_equal(y, true_y)
    assert_array_equal(X.toarray(), true_X)
Example #19
def test_dump_concise():
    one = 1
    two = 2.1
    three = 3.01
    exact = 1.000000000000001
    # loses the last decimal place
    almost = 1.0000000000000001
    X = [[one, two, three, exact, almost],
         [1e9, 2e18, 3e27, 0, 0],
         [0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0]]
    y = [one, two, three, exact, almost]
    f = BytesIO()
    dump_svmlight_file(X, y, f)
    f.seek(0)
    # make sure it's using the most concise format possible
    assert_equal(f.readline(),
                 b("1 0:1 1:2.1 2:3.01 3:1.000000000000001 4:1\n"))
    assert_equal(f.readline(), b("2.1 0:1000000000 1:2e+18 2:3e+27\n"))
    assert_equal(f.readline(), b("3.01 \n"))
    assert_equal(f.readline(), b("1.000000000000001 \n"))
    assert_equal(f.readline(), b("1 \n"))
    f.seek(0)
    # make sure it's correct too :)
    X2, y2 = load_svmlight_file(f)
    assert_array_almost_equal(X, X2.toarray())
    assert_array_equal(y, y2)
Example #20
def create_train_test(n_samples, doc2vec, save_svmlight=True):

    print "Creating train & test sets..."

    # Create labelled data arrays.

    data = np.zeros((n_samples, doc2vec.size))
    labels = np.zeros(n_samples)

    for i in range(n_samples / 2):

        prefix_train_pos = 'TRAIN_POS_' + str(i)
        prefix_train_neg = 'TRAIN_NEG_' + str(i)

        data[i] = doc2vec.model.docvecs[prefix_train_pos]
        data[n_samples / 2 + i] = doc2vec.model.docvecs[prefix_train_neg]

        labels[i] = 1

    # Split in train and validation arrays.

    train, test, train_labels, test_labels = train_test_split(
        data, labels, test_size=0.3, random_state=42)

    if save_svmlight:

        current_path = os.path.abspath(
            os.path.join(os.getcwd(), os.pardir))

        dump_svmlight_file(train, train_labels, current_path + "/Data/Processed/TrainSet.svm")
        dump_svmlight_file(test, test_labels, current_path + "/Data/Processed/TestSet.svm")

    return train, test, train_labels, test_labels
Example #21
def pair_vectors(pairs, features, words, output_path):
    vectorizer = DictVectorizer()
    vectors = vectorizer.fit_transform(x[1] for x in features)

    vector_map = {word:vector for word, vector in
                  itertools.izip((x[0].split('/')[0] for x in features),
                                 vectors)}

    # Positive examples
    positive = []
    record = []
    for specific, general in pairs:
        positive.append(vector_map[general] - vector_map[specific])
        record.append( (specific, general, 1) )

    pair_set = set([tuple(x) for x in pairs])
    non_positive = []
    for i in range(len(positive)):
        first = second = None
        while first == second or (first, second) in pair_set:
            first = words[random.randint(len(words))]
            second = words[random.randint(len(words))]
        non_positive.append(vector_map[second] - vector_map[first])
        record.append( (first, second, 0) )
    
    data = vstack(positive + non_positive)
    target = [1]*len(positive) + [0]*len(non_positive)
    
    # Save dataset
    with open(os.path.join(output_path,'wn-noun-dependencies.mat'), 'wb') as data_file:
        dump_svmlight_file(data, target, data_file)

    with open(os.path.join(output_path,'wn-noun-dependencies.json'), 'w') as record_file:
        json.dump(record, record_file)
Example #22
 def predict(self,X) :
     # write test file to specific format
     dump_svmlight_file(X,np.zeros(X.shape[0]),self._test_file_name,zero_based=True)
     # call _exectute_prediction
     self._execute_prediction(self._test_file_name)
     # import the output of the script and return it
     predictions = pd.read_csv(self._temp_pred_file_name,header = None)
     return predictions
Example #23
def download_mnist():
    training, test = fetch_mnist(data_home=dataset_dir)

    X, y = training
    datasets.dump_svmlight_file(X, y, join(dataset_dir, "mnist"))

    X, y = test
    datasets.dump_svmlight_file(X, y, join(dataset_dir, "mnist.t"))
Example #24
def save_svmlight(x, y, path):
    LOG.debug("saving svmlight to %s", path)
    ensure_exist(path)
    _, n = x.shape
    with open(os.path.join(path, "nfeature.txt"), "wb") as f:
        f.write(str(n))

    dump_svmlight_file(x, y, os.path.join(path, "data"), zero_based=False)
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_iter=100, dim=4, lrate=.1, n_fold=5):

    feature_name = os.path.basename(train_file)[:-8]
    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG, filename='libfm_{}_{}_{}_{}.log'.format(
                                                        n_iter, dim, lrate,
                                                        feature_name
                                                      ))

    logging.info('Loading training data')
    X, y = load_svmlight_file(train_file)

    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    logging.info('Cross validation...')
    p = np.zeros_like(y)
    lloss = 0.
    for i_trn, i_val in cv:
        now = datetime.now().strftime('%Y%m%d-%H%M%S')
        valid_train_file = '/tmp/libfm_train_{}_{}.sps'.format(feature_name, now)
        valid_test_file = '/tmp/libfm_valid_{}_{}.sps'.format(feature_name, now)
        valid_predict_file = '/tmp/libfm_predict_{}_{}.sps'.format(feature_name, now)

        dump_svmlight_file(X[i_trn], y[i_trn], valid_train_file,
                           zero_based=False)
        dump_svmlight_file(X[i_val], y[i_val], valid_test_file,
                           zero_based=False)

        subprocess.call(["libFM",
                         "-task", "c",
                         '-dim', '1,1,{}'.format(dim),
                         '-init_stdev', str(lrate),
                         '-iter', str(n_iter),
                         '-train', valid_train_file,
                         '-test', valid_test_file,
                         '-out', valid_predict_file])

        p[i_val] = np.loadtxt(valid_predict_file)
        lloss += log_loss(y[i_val], p[i_val])

        os.remove(valid_train_file)
        os.remove(valid_test_file)
        os.remove(valid_predict_file)

    logging.info('Log Loss = {:.4f}'.format(lloss / n_fold))
    np.savetxt(predict_valid_file, p, fmt='%.6f')

    logging.info('Retraining with 100% data...')
    subprocess.call(["libFM",
                     "-task", "c",
                     '-dim', '1,1,{}'.format(dim),
                     '-init_stdev', str(lrate),
                     '-iter', str(n_iter),
                     '-train', train_file,
                     '-test', test_file,
                     '-out', predict_test_file])
Example #26
def get_train_and_test_spaese_matrix():
    Y = pd.read_csv('data/train_Y.csv', index_col='user_id')['type']
    train_X = combine_all_behavior()
    # dump_svmlight_file(train_X,Y,'data/train_metrix')
    dump_svmlight_file(train_X,Y,'data/train_metrix_3')
    test_X = combine_all_behavior(is_train=False)
    test_Y = [0]*(test_X.shape[0])
    # dump_svmlight_file(test_X,test_Y,'data/test_metrix')
    dump_svmlight_file(test_X,test_Y,'data/test_metrix_3')
def test_dump():
    X_sparse, y_dense = load_svmlight_file(datafile)
    X_dense = X_sparse.toarray()
    y_sparse = sp.csr_matrix(y_dense)

    # slicing a csr_matrix can unsort its .indices, so test that we sort
    # those correctly
    X_sliced = X_sparse[np.arange(X_sparse.shape[0])]
    y_sliced = y_sparse[np.arange(y_sparse.shape[0])]

    for X in (X_sparse, X_dense, X_sliced):
        for y in (y_sparse, y_dense, y_sliced):
            for zero_based in (True, False):
                for dtype in [np.float32, np.float64, np.int32]:
                    f = BytesIO()
                    # we need to pass a comment to get the version info in;
                    # LibSVM doesn't grok comments so they're not put in by
                    # default anymore.

                    if (sp.issparse(y) and y.shape[0] == 1):
                        # make sure y's shape is: (n_samples, n_labels)
                        # when it is sparse
                        y = y.T

                    dump_svmlight_file(X.astype(dtype), y, f, comment="test",
                                       zero_based=zero_based)
                    f.seek(0)

                    comment = f.readline()
                    comment = str(comment, "utf-8")

                    assert_in("scikit-learn %s" % sklearn.__version__, comment)

                    comment = f.readline()
                    comment = str(comment, "utf-8")

                    assert_in(["one", "zero"][zero_based] + "-based", comment)

                    X2, y2 = load_svmlight_file(f, dtype=dtype,
                                                zero_based=zero_based)
                    assert_equal(X2.dtype, dtype)
                    assert_array_equal(X2.sorted_indices().indices, X2.indices)

                    X2_dense = X2.toarray()

                    if dtype == np.float32:
                        # allow a rounding error at the last decimal place
                        assert_array_almost_equal(
                            X_dense.astype(dtype), X2_dense, 4)
                        assert_array_almost_equal(
                            y_dense.astype(dtype), y2, 4)
                    else:
                        # allow a rounding error at the last decimal place
                        assert_array_almost_equal(
                            X_dense.astype(dtype), X2_dense, 15)
                        assert_array_almost_equal(
                            y_dense.astype(dtype), y2, 15)
Example #28
def project(file_name, dimensions):
    data = load_svmlight_file(file_name)
    projector = SparseRandomProjection(dimensions, 1/3.0,
                                       dense_output=True)
    projected = projector.fit_transform(data[0])
    
    new_file_name = file_name[:-4] + '-' + str(dimensions) + '.mat'
    new_file = open(new_file_name, 'wb')
    dump_svmlight_file(projected, data[1], new_file)
Example #29
File: bidnn.py Project: v-v/BiDNN
    def save_output(self, X, epoch=None):
        # write output to file
        if epoch is not None:
            fname_out = self.conf.fname_out.replace('%e', str(epoch).zfill(5))
        else:
            fname_out = self.conf.fname_out.replace('%e', 'final')

        if self.conf.verbosity > 1:
            print "Saving output to", fname_out, "..."
        dump_svmlight_file(X, self.tl, fname_out)
Example #30
def glimpse_to_svmlight(input_file, train_file, test_file):
  with open(input_file) as fh:
    exp = pickle.load(fh)
  ftrs = ExtractFeatures(Layer.C2, exp.extractor.activation)
  trng = GetTrainingSet(exp)
  dump_svmlight_file(ftrs[trng], exp.corpus.labels[trng] + 1, train_file, zero_based=False)
  dump_svmlight_file(ftrs[~trng], exp.corpus.labels[~trng] + 1, test_file, zero_based=False)
  print "Categories"
  print "----------"
  print "\n".join("%d - %s" % (index+1,name) for (index,name) in enumerate(exp.corpus.class_names))
Example #31
 folds = preprocess.create_folds(X, y, queries, 5)
 fold_number = 1
 C_array = [0.1, 0.01, 0.001]
 # model_handler = mh.models_handler(C_array)
 validated = set()
 scores = {}
 models = {}
 for train, test in folds:
     evaluator.empty_validation_files()
     validated, validation_set, train_set = preprocess.create_validation_set(
         5, validated, set(train), number_of_queries, queries)
     train_file = "train" + str(fold_number) + ".txt"
     run_command("rm " + train_file)
     dump_svmlight_file(X[train],
                        y[train],
                        train_file,
                        query_id=queries[train],
                        zero_based=False)
     for C in C_array:
         model_file = learn_svm(C, train_file, fold_number)
         weights = recover_model(model_file)
         svm = s.svm_sgd(C)
         svm.w = weights
         score_file = svm.predict(X, queries, validation_set, evaluator,
                                  True)
         score = evaluator.run_trec_eval(score_file, qrels_file)
         scores[svm.C] = score
         models[svm.C] = svm
     max_C = max(scores.items(), key=operator.itemgetter(1))[0]
     chosen_model = models[max_C]
     chosen_model.predict(X, queries, test, evaluator)
Example #32
def random(data, d_sub, name, what, array, R, n_label):
    np.set_printoptions(threshold=np.inf, suppress=True)
    data = scipy.sparse.coo_matrix.tocsr(data)
    label = data[:, 0]
    data = sklearn.preprocessing.maxabs_scale(data[:, 1:])
    n_sample, d_feature = np.shape(data)
    a = math.sqrt(float(d_feature / d_sub)) * data
    #
    # pca = PCA(n_components=d_sub).fit_transform(a)
    #
    # label = (np.asarray(label)).reshape(-1)
    # dump_svmlight_file(pca, label, 'other_method/kmeans_linear/%s/%s_pca' % (name, what), zero_based=False)

    train2 = a[:, R]
    dump_svmlight_file(train2,
                       label,
                       'other_method/kmeans_linear/%s/%s_random' %
                       (name, what),
                       zero_based=False)

    # func = KMeans(n_clusters=d_sub)
    # transform = a.T
    # kmeans = func.fit(transform)
    # klabel = kmeans.labels_
    # R_means = []
    # for i in range(d_sub):
    #     temp1 = []
    #     temp2 = []
    #     for idx, val in enumerate(klabel):
    #         if val == i:
    #             temp1.append(idx)
    #             distance = func.transform(transform[idx])
    #             temp2.append(distance[0, val])
    #
    #     idx_temp = temp2.index(min(temp2))
    #     R_means.append(temp1[idx_temp])
    # R_means = sorted(R_means)
    # train2 = a[:,R_means]
    # label = (np.asarray(label)).reshape(-1)
    # dump_svmlight_file(train2, label, 'other_method/kmeans_linear/%s/%s_means' % (name, what),
    #                    zero_based=False)
    # b = a
    # transform = b.T
    # var = np.var(transform, axis=1)
    # topn = np.argsort(var, axis=0)
    # topn = np.reshape(topn, (1, -1))
    # topn = np.asarray(topn)
    # R_variace = topn[0, d_feature - d_sub:]
    # R_variace = sorted(R_variace)
    # R_variace = np.asarray(R_variace)
    # train2 = a[:,R_variace]
    # label = (np.asarray(label)).reshape(-1)
    # dump_svmlight_file(train2, label, 'other_method/kmeans_linear/%s/%s_variance' % (name, what),
    #                    zero_based=False)
    b = a[:n_label]
    transform = b.T
    norm = linalg.norm(transform, axis=1)
    # print(norm)
    topn = np.argsort(norm, axis=0)
    # print(topn)
    topn = np.reshape(topn, (1, -1))
    topn = np.asarray(topn)
    R_norm = topn[0, d_feature - d_sub:]
    R_norm = sorted(R_norm)
    R_norm = np.asarray(R_norm)
    train2 = a[:, R_norm]
    # exit()
    dump_svmlight_file(train2,
                       label,
                       'other_method/kmeans_linear/%s/%s_norm' % (name, what),
                       zero_based=False)
Example #33
def save_as_svmlight(M, yid, fname):
    dump_svmlight_file(M[:, [x for x in range(M.shape[1]) if x != yid]],
                       M[:, yid], fname)
        dev_text_token_list = []
        for text in dev_review_list:
            dev_text_token_list.append(Counter(set(text)))

        test_text_token_list = []
        for text in test_review_list:
            test_text_token_list.append(Counter(set(text)))

    print('Save Data')
    #Save csr_matrix data into libsvm
    train_review_matrix = review_to_csr_matrix(train_text_token_list,
                                               feature_list, feature_num)
    train_stars_matrix = stars_to_csr_matrix(train_stars_list, class_num)
    dump_svmlight_file(train_review_matrix,
                       train_stars_matrix,
                       str(method + '_train.libsvm'),
                       multilabel=True)
    print('Data saved: {}'.format(str(method + '_train.libsvm')))

    eval_review_matrix = review_to_csr_matrix(eval_text_token_list,
                                              feature_list, feature_num)
    eval_stars_matrix = stars_to_csr_matrix(eval_stars_list, class_num)
    dump_svmlight_file(eval_review_matrix,
                       eval_stars_matrix,
                       str(method + '_eval.libsvm'),
                       multilabel=True)
    print('Data saved: {}'.format(str(method + '_eval.libsvm')))

    dev_review_matrix = review_to_csr_matrix(dev_text_token_list, feature_list,
                                             feature_num)
    dev_stars_matrix = stars_to_csr_matrix(dev_stars_list, class_num)
Example #35
# -*- coding: utf-8 -*-
"""
Created on Sun Sep  9 00:18:22 2018

@author: Nitin
"""
"""
from sklearn.datasets import load_svmlight_file, dump_svmlight_file
file = "../data/usps/usps-train.dat"
X,y = load_svmlight_file(file)
dump_svmlight_file(X, y, file, zero_based=True, comment=None, query_id=None, multilabel=False)

file = "../data/usps/usps-test.dat"
X,y = load_svmlight_file(file)
dump_svmlight_file(X, y, file, zero_based=True, comment=None, query_id=None, multilabel=False)

for i in range(10):
    file = "../data/usps/t_{}.dat".format(i)
    X,y = load_svmlight_file(file)
    dump_svmlight_file(X, y, file, zero_based=True, comment=None, query_id=None, multilabel=False)
"""

# -*- coding: utf-8 -*-
"""
Created on Fri Feb 23 23:20:58 2018

File to load and process the USPS data into a binary classification format.
@author: Nitin
"""

import numpy as np
Example #36
#!/usr/bin/python
# -*- coding: utf-8 -*-

from sklearn.datasets import dump_svmlight_file
import pandas as pd
import numpy as np

df = pd.read_csv("transfered.csv")

y_data = np.array(df['is_malware'])
del df['is_malware']
del df['sha1']

X_data = np.array(df)

dump_svmlight_file(X_data,
                   y_data,
                   'apk_libsvm.dat',
                   zero_based=False,
                   multilabel=False)
Example #37
    model_file = os.path.join(data_path, "sofml.model")
    training_file = os.path.join(data_path, "train_data.dat")
    training2_file = os.path.join(data_path, "train2_data.dat")
    test_file = os.path.join(data_path, "test_data.dat")
    pred_train2_file = os.path.join(data_path, "pred_train2.csv")
    pred_test_file = os.path.join(data_path, "pred_test.csv")

    # write out traindata and testdata to svmlight format
    print "writing out files"
    ntrain1_label = train1_label.copy()
    ntrain1_label.values[np.where(ntrain1_label == 0)] = -1
    ntrain2_label = train2_label.copy()
    ntrain2_label.values[np.where(ntrain2_label == 0)] = -1
    dump_svmlight_file(train1_data_norm,
                       ntrain1_label,
                       training_file,
                       zero_based=False)
    dump_svmlight_file(train2_data_norm,
                       ntrain2_label,
                       training2_file,
                       zero_based=False)
    dump_svmlight_file(test_data_norm,
                       np.zeros((test_data_norm.shape[0], )),
                       test_file,
                       zero_based=False)

    # train
    #print "training sofia"
    call(
        sofiaml_path +
        " --learner_type sgd-svm --loop_type roc --prediction_type logistic --iterations 200000 --lambda 10000 --training_file "
Example #38
def generate_feature(train_file, test_file, object_file, train_feature_file,
                     test_feature_file):

    logging.info('loading input data')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)
    obj = pd.read_csv(object_file, header=None)
    obj.columns = ['course_id', 'object', 'category', 'children', 'start']

    n_trn = trn.shape[0]

    trn.time = pd.to_datetime(trn.time)
    tst.time = pd.to_datetime(tst.time)

    df = pd.concat([trn, tst], axis=0)

    # get last dates of courses
    last_date = df[['course_id', 'time']].groupby('course_id',
                                                  as_index=False).max()
    last_date.columns = ['course_id', 'last_date']

    # extract object information
    obj.children.fillna('', inplace=True)
    obj['n_children'] = obj.children.apply(
        lambda x: int(np.log2(1 + len(x.split()))))
    obj.start.replace('null', '1999-01-01 00:00:00', inplace=True)
    obj.start = pd.to_datetime(obj.start)
    obj = pd.merge(obj, last_date, on='course_id', how='left')

    obj['obj_days_before_last_date'] = (
        obj.last_date - obj.start).apply(lambda x: pd.Timedelta(x).days)
    obj.ix[obj.obj_days_before_last_date > 30,
           'obj_days_before_last_date'] = 30

    # merge log data with last coursedate and object information
    df = pd.merge(df, last_date, on='course_id', how='left')
    df = pd.merge(
        df,
        obj[['object', 'category', 'n_children', 'obj_days_before_last_date']],
        on='object',
        how='left')

    df['days_before_last_date'] = (
        df.last_date - df.time).apply(lambda x: pd.Timedelta(x).days)
    df['weeks_before_last_date'] = df.days_before_last_date // 7
    df.ix[df.weeks_before_last_date == 4, 'weeks_before_last_date'] = 3
    df['last_month'] = df.last_date.apply(lambda x: x.month)

    df['obj_10_days_after_last_date'] = df.obj_days_before_last_date.apply(
        lambda x: 1 if x < 0 and x >= -10 else 0)
    df.obj_days_before_last_date = df.obj_days_before_last_date.apply(
        lambda x: np.sign(x) * int(np.log2(1 + np.sign(x) * x)) \
                  if ~pd.isnull(x) else x
            )

    df.drop(['time', 'last_date'], axis=1, inplace=True)
    df.set_index('enrollment_id', inplace=True)

    X = encode_categorical_features(df, n=n_trn, min_obs=100, nan_as_var=True)
    X = X.tocsr()

    dump_svmlight_file(X[:n_trn],
                       trn.enrollment_id.values,
                       train_feature_file,
                       zero_based=False)
    dump_svmlight_file(X[n_trn:],
                       tst.enrollment_id.values,
                       test_feature_file,
                       zero_based=False)
Example #39
def dump_data(x, y, file_output):
    datasets.dump_svmlight_file(x, y, file_output)
    os.remove("%s_tmp" % file_output)
Example #40
    def test(self):
        X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True),
                                                            test_size=0.1, random_state=2)
        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = train_data.create_valid(X_test, label=y_test)

        params = {
            "objective": "binary",
            "metric": "auc",
            "min_data": 10,
            "num_leaves": 15,
            "verbose": -1,
            "num_threads": 1,
            "max_bin": 255,
            "gpu_use_dp": True
        }
        bst = lgb.Booster(params, train_data)
        bst.add_valid(valid_data, "valid_1")

        for i in range(20):
            bst.update()
            if i % 10 == 0:
                print(bst.eval_train(), bst.eval_valid())

        self.assertEqual(bst.current_iteration(), 20)
        self.assertEqual(bst.num_trees(), 20)
        self.assertEqual(bst.num_model_per_iteration(), 1)
        self.assertAlmostEqual(bst.lower_bound(), -2.9040190126976606)
        self.assertAlmostEqual(bst.upper_bound(), 3.3182142872462883)

        bst.save_model("model.txt")
        pred_from_matr = bst.predict(X_test)
        with tempfile.NamedTemporaryFile() as f:
            tname = f.name
        with open(tname, "w+b") as f:
            dump_svmlight_file(X_test, y_test, f)
        pred_from_file = bst.predict(tname)
        os.remove(tname)
        np.testing.assert_allclose(pred_from_matr, pred_from_file)

        # check saved model persistence
        bst = lgb.Booster(params, model_file="model.txt")
        os.remove("model.txt")
        pred_from_model_file = bst.predict(X_test)
        # we need to check the consistency of model file here, so test for exact equal
        np.testing.assert_array_equal(pred_from_matr, pred_from_model_file)

        # check early stopping is working. Make it stop very early, so the scores should be very close to zero
        pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5, "pred_early_stop_margin": 1.5}
        pred_early_stopping = bst.predict(X_test, **pred_parameter)
        # scores likely to be different, but prediction should still be the same
        np.testing.assert_array_equal(np.sign(pred_from_matr), np.sign(pred_early_stopping))

        # test that shape is checked during prediction
        bad_X_test = X_test[:, 1:]
        bad_shape_error_msg = "The number of features in data*"
        np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
                                       bst.predict, bad_X_test)
        np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
                                       bst.predict, sparse.csr_matrix(bad_X_test))
        np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
                                       bst.predict, sparse.csc_matrix(bad_X_test))
        with open(tname, "w+b") as f:
            dump_svmlight_file(bad_X_test, y_test, f)
        np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
                                       bst.predict, tname)
        with open(tname, "w+b") as f:
            dump_svmlight_file(X_test, y_test, f, zero_based=False)
        np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
                                       bst.predict, tname)
        os.remove(tname)
Example #41
    #    test_len,
]).tocsr()

print X.shape
print X_t.shape

skf = KFold(n_splits=5, shuffle=True, random_state=seed).split(X)
for ind_tr, ind_te in skf:
    X_train = X[ind_tr]
    X_test = X[ind_te]

    y_train = y[ind_tr]
    y_test = y[ind_te]
    break

dump_svmlight_file(X, y, inDir + "/input/X_tfidf.svm")
del X
dump_svmlight_file(X_t, np.zeros(X_t.shape[0]), inDir + "/input/X_t_tfidf.svm")
del X_t


def oversample(X_ot, y, p=0.165):
    pos_ot = X_ot[y == 1]
    neg_ot = X_ot[y == 0]
    #p = 0.165
    scale = ((pos_ot.shape[0] * 1.0 /
              (pos_ot.shape[0] + neg_ot.shape[0])) / p) - 1
    while scale > 1:
        neg_ot = ssp.vstack([neg_ot, neg_ot]).tocsr()
        scale -= 1
    neg_ot = ssp.vstack([neg_ot,
Example #42
imax = COL_LIMIT if COL_LIMIT < Y.shape[1] else Y.shape[1]

for i in range(imax):
    i = 53
    y = Y[:,i]
    took_indexes = (y != -1).nonzero()[0]
    print "Got indexes"
    if len(took_indexes) < 1000: continue
    y = y[took_indexes]
    x = X[took_indexes]

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=0)

    print "Dumping"
    datasets.dump_svmlight_file(x_train, y_train, "tmp/train%d" % i)
    datasets.dump_svmlight_file(x_test, y_test, "tmp/test%d" % i)
    os.system("cd tmp && csplit -s test%d 3 && mv xx01 test%d && rm xx00" % (i, i))
    os.system("cd tmp && csplit -s train%d 3 && mv xx01 train%d && rm xx00" % (i, i))
    # os.system("svm-train -g 2 -c 8 -q tmp/train%d tmp/model%d" % (i, i))
    os.system("svm-train -g 0.03125 -c 32 tmp/train%d tmp/model%d" % (i, i))
    os.system("svm-predict tmp/test%d tmp/model%d tmp/predicted%d" % (i, i, i))

    y_predicted = np.array(map((lambda n: np.float64(n)), open("tmp/predicted%d" % i).read().split("\n")[0:-1]))
    deltas = np.subtract(y_predicted, y_test)
    rms = np.sqrt(np.mean(deltas**2))

    print "Subject %s: RMS %f" % (y_labels[i], rms)
    # os.system("rm tmp/test%d tmp/train%d tmp/predicted%d" % (i, i, i))
    DELTAS.append(deltas)
    #pred_train = np.hstack(( pred_train, np.reshape(pred_label_train6,(-1,1)) ))
    #pred_test = np.hstack(( pred_test, np.reshape(pred_label_test6,(-1,1)) ))

    #vw
    print "Vowpal Wabbit"
    ss = StandardScaler()
    train1_data_norm = ss.fit_transform(train1_data)
    train2_data_norm = ss.transform(train2_data)
    test_data_norm = ss.transform(test_data)

    ntrain1_label = train_label1.copy()
    ntrain1_label.values[np.where(ntrain1_label == 0)] = -1
    ntrain2_label = train_label2.copy()
    ntrain2_label.values[np.where(ntrain2_label == 0)] = -1
    dump_svmlight_file(train1_data,
                       ntrain1_label,
                       "train1.vw",
                       zero_based=False)
    dump_svmlight_file(train2_data,
                       ntrain2_label,
                       "train2.vw",
                       zero_based=False)
    dump_svmlight_file(test_data,
                       np.zeros((test_data_norm.shape[0], )),
                       "test.vw",
                       zero_based=False)
    print 1

    of = open("vw_train1set.csv", "w")
    of2 = open("vw_train2set.csv", "w")
    of3 = open("vw_testset.csv", "w")
    fi = open("train1.vw", "r")
        logger.info("complete train %s feature extraction" %
                    column_list[column_list.index(key)])
        print(column_list[column_list.index(key)] + " " +
              "trainset vocab shape: " + str(features_train.shape))

        #data format
        logger.info("start data format %s" %
                    column_list[column_list.index(key)])
        train_features_save_path = "./classifier_3_train_features_svm_format_files/" + column_list[
            column_list.index(key)]
        if not os.path.exists(train_features_save_path):
            os.makedirs(train_features_save_path)
        dump_svmlight_file(features_train,
                           label_train,
                           train_features_save_path + '/' +
                           column_list[column_list.index(key)] + '.txt',
                           zero_based=True,
                           comment=None,
                           query_id=None)
        logger.info("complete data format %s" %
                    column_list[column_list.index(key)])
    logger.info("complete all data format")

    logger.info("start train model")
    for column in column_list:
        model_save_path = "./model_files_classifier_3/" + column
        if not os.path.exists(model_save_path):
            os.makedirs(model_save_path)
        subprocess.call(
            "./thundersvm-master/build/bin/thundersvm-train -c 100 -g 0.5 " +
            "./classifier_3_train_features_svm_format_files/" + column + "/" +
Example #45
 def dump_svmlight_file(self, file):
     data = np.array(self.data)
     X = data[:, 0:2]
     y = data[:, 2]
     dump_svmlight_file(X, y, file)
Example #46
from sklearn.datasets import dump_svmlight_file
from sklearn.externals import joblib
from sklearn.metrics import precision_score

iris = datasets.load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# use DMatrix for xgboost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# use svmlight file for xgboost
dump_svmlight_file(X_train, y_train, 'dtrain.svm', zero_based=True)
dump_svmlight_file(X_test, y_test, 'dtest.svm', zero_based=True)
dtrain_svm = xgb.DMatrix('dtrain.svm')
dtest_svm = xgb.DMatrix('dtest.svm')

# set xgboost params
param = {
    'max_depth': 3,  # the maximum depth of each tree
    'eta': 0.3,  # the training step for each iteration
    'silent': 1,  # logging mode - quiet
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': 3}  # the number of classes that exist in this dataset
num_round = 20  # the number of training iterations

#------------- numpy array ------------------
# training and testing - numpy matrices
Example #47
    datasets = ['webspam_u']

    for dataset in datasets:

        if os.path.isfile('/home/neyo/PycharmProjects/AUC/datasets/%s' %
                          (dataset)):

            print('Loading dataset = %s......' % (dataset), end=' ')
            X, y = load_svmlight_file(
                '/home/neyo/PycharmProjects/AUC/datasets/%s' % (dataset))

            print('Done! Converting to binary......', end=' ')
            m = np.mean(y)
            INDEX = np.argwhere(y > m)
            index = np.argwhere(y <= m)
            y[INDEX] = 1
            y[index] = -1

            print('Done! Normalizing......', end=' ')
            X = preprocessing.normalize(X)

            print('Done! Dumping into file......', end=' ')
            dump_svmlight_file(
                X,
                y,
                '/home/neyo/PycharmProjects/AUC/bi-datasets/%s' % (dataset),
                zero_based=False)
            print('Done!')

        else:
            pass
Example #48
n_classes = 2

split_params = {'test_size': 0.2, 'random_state': seed}

X, y = datasets.make_classification(n_samples=n_samples,
                                    class_sep=0.4,
                                    n_features=n_features,
                                    n_classes=n_classes,
                                    random_state=seed)
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, y, **split_params)

# Save data in .svm format
train_path = os.path.abspath('x_train.svm')
test_path = os.path.abspath('x_test.svm')
datasets.dump_svmlight_file(x_train, y_train, train_path)
datasets.dump_svmlight_file(x_test, y_test, test_path)

clf = XGBClassifier(port=8085, nclasses=n_classes)

booster_params = {
    "max_depth": 10,
    "subsample": 0.8,
    "eta": 0.3,
    "drop_rate": 0.4,
    "skip_drop": 0.4
}
clf.fit([train_path, test_path],
        booster="dart",
        iterations=20,
        **booster_params)
Example #49
x_test = [cv.preprocessor(x) for x in x_test]

x_train_pre = x_train
x_test_pre = x_test

y_train = [float(y)
           for y in y_train]  # float so the classifier can use the labels
y_test = [float(y) for y in y_test]

union = Pipeline([(
    'features',
    FeatureUnion(transformer_list=[(
        'tfdif_features',
        Pipeline([('word',
                   TfidfVectorizer(ngram_range=(1,
                                                2)))  #,                      
                  ]))]))])

x_train = union.fit_transform(x_train)
print('fold ' + str(index) + ', x_train.shape: ', x_train.shape)
dump_svmlight_file(
    x_train, y_train,
    "dataset/representations/" + name_dataset + '/train' + str(index))

x_test = union.transform(x_test)
print('fold ' + str(index) + ', x_test.shape: ', x_test.shape)
dump_svmlight_file(
    x_test, y_test,
    "dataset/representations/" + name_dataset + '/test' + str(index))
print("Time End: %f" % (timeit.default_timer() - ini))
Example #50
    df,
    pd.DataFrame(array, columns=map(lambda x: 'ATTRS' + str(x), range(100)))
],
               axis=1)
df.drop("ATTRS_STR", axis=1, inplace=True)
del array

# ---------- Convert the category column to one-hot ----------
df = pd.concat([df, pd.get_dummies(df['CATID'], prefix='CATID')], axis=1)
df.drop('CATID', axis=1, inplace=True)

# ---------- Drop the three leading ID columns ----------
df.drop(['ID', 'ADVID', 'GOODSID'], axis=1, inplace=True)
print df.head()

# Write out the file
#with open('goods_vectors_newnew.csv', 'a') as f:
#        for row in df.values:
#                f.write(",".join(map(str, row)) + '\n')
#sqldf = sqlContext.createDataFrame(df)
#sqldf.save(path='/user/mjoys/goods_vectors_new', mode='overwrite')
print df.info()

print("Writing...")
# Convert the data to libsvm format
from sklearn.datasets import dump_svmlight_file

dump_svmlight_file(df.values, df.index.values, 'goods_vectors.libsvm')

print("Done")
Example #51
def tovw(x, y=None, sample_weight=None, convert_labels=False):
    """Convert array or sparse matrix to Vowpal Wabbit format

    Parameters
    ----------

    x : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : {array-like}, shape (n_samples,), optional
        Target vector relative to X.
    sample_weight : {array-like}, shape (n_samples,), optional
                    sample weight vector relative to X.
    convert_labels : {bool} convert labels of the form [0,1] to [-1,1]

    Returns
    -------

    out : {array-like}, shape (n_samples, 1)
          Training vectors in VW string format

    Examples
    --------

    >>> import pandas as pd
    >>> from sklearn.feature_extraction.text import HashingVectorizer
    >>> from vowpalwabbit.sklearn_vw import tovw
    >>> X = pd.Series(['cat', 'dog', 'cat', 'cat'], name='catdog')
    >>> y = pd.Series([-1, 1, -1, -1], name='label')
    >>> hv = HashingVectorizer()
    >>> hashed = hv.fit_transform(X)
    >>> tovw(x=hashed, y=y)
    """

    use_truth = y is not None
    use_weight = sample_weight is not None

    if use_truth:
        x, y = check_X_y(x, y, accept_sparse=True)
    else:
        x = check_array(x, accept_sparse=True)

    if use_weight:
        sample_weight = check_array(sample_weight,
                                    accept_sparse=False,
                                    ensure_2d=False,
                                    dtype=np.int,
                                    order="C")
        if sample_weight.ndim != 1:
            raise ValueError("Sample weights must be 1D array or scalar")
        if sample_weight.shape != (x.shape[0], ):
            raise ValueError("Sample weight shape == {}, expected {}".format(
                sample_weight.shape, (x.shape[0], )))
    else:
        sample_weight = np.ones(x.shape[0], dtype=np.int)

    # convert labels of the form [0,1] to [-1,1]
    if convert_labels:
        y = np.where(y < 1, -1, 1)

    rows, cols = x.shape

    # check for invalid characters if array has string values
    if x.dtype.char == 'S':
        for row in range(rows):
            for col in range(cols):
                x[row, col] = INVALID_CHARS.sub('.', x[row, col])

    # convert input to svmlight format
    s = io.BytesIO()
    dump_svmlight_file(x, np.zeros(rows), s)

    # parse entries to construct VW format
    rows = s.getvalue().decode('ascii').split('\n')[:-1]
    out = []
    for idx, row in enumerate(rows):
        truth = y[idx] if use_truth else 1
        weight = sample_weight[idx]
        features = row.split('0 ', 1)[1]
        # only using a single namespace and no tags
        out.append(('{y} {w} |{ns} {x}'.format(y=truth,
                                               w=weight,
                                               ns=DEFAULT_NS,
                                               x=features)))

    s.close()

    return out
import argparse
from sklearn.datasets import load_svmlight_file, dump_svmlight_file
import scipy.sparse as sp

if __name__ == '__main__':
	parser = argparse.ArgumentParser(description='Concatenates datasets by features.')
	parser.add_argument('datasets', metavar='data', type=str, nargs='+',
	                    help='datasets which will be concatenate')
	parser.add_argument('-o', '--output', type=str,
	                    help='file to save the concatenated datasets')

	args = parser.parse_args()

	X_out = None
	for data in args.datasets:
		X, y = load_svmlight_file(data)
		X_out = X if X_out is None else sp.hstack((X_out, X))
	
	if args.output is not None:
		dump_svmlight_file(X_out, y, args.output)
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.datasets import dump_svmlight_file
import numpy as np 
import os

y = [0,0,0,0]
i = 0

vectorizer = CountVectorizer()

X = vectorizer.fit_transform(pre_svm)
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)
f = open('/Users/arrowlittle/Desktop/data/wiki_libsvm.txt', 'w')
dump_svmlight_file(tfidf, y, f, zero_based=False)
f.close()

print tfidf.toarray()


#split data

from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

data = spark.read.format("libsvm") \
    .load("/Users/arrowlittle/Desktop/data/wiki_libsvm.txt")

splits = data.randomSplit([0.9, 0.1], 1234)
train = splits[0]
Example #54
    ]).tocsr()


print X.shape
print X_t.shape

skf = KFold(n_splits=5, shuffle=True, random_state=SEED).split(X)
for ind_tr, ind_te in skf:
    X_train = X[ind_tr]
    X_test = X[ind_te]

    y_train = y[ind_tr]
    y_test = y[ind_te]
    break

dump_svmlight_file(X, y, PATH + "X_tfidf.svm")
del X
dump_svmlight_file(X_t, np.zeros(X_t.shape[0]), PATH + "X_t_tfidf.svm")
del X_t

def oversample(X_ot,y,p=0.175):
    pos_ot = X_ot[y==1]
    neg_ot = X_ot[y==0]
    #p = 0.165
    scale = ((pos_ot.shape[0]*1.0 / (pos_ot.shape[0] + neg_ot.shape[0])) / p) - 1
    while scale > 1:
        neg_ot = ssp.vstack([neg_ot, neg_ot]).tocsr()
        scale -= 1
    neg_ot = ssp.vstack([neg_ot, neg_ot[:int(scale * neg_ot.shape[0])]]).tocsr()
    ot = ssp.vstack([pos_ot, neg_ot]).tocsr()
    y = np.zeros(ot.shape[0])
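To illustrate the arithmetic in oversample, a small worked example with made-up counts (not part of the original code):

# 10,000 positives and 40,000 negatives give a positive share of 0.20
pos, neg, p = 10000, 40000, 0.175
scale = ((pos * 1.0 / (pos + neg)) / p) - 1   # 0.2 / 0.175 - 1 ≈ 0.143
# scale < 1, so the doubling loop is skipped and ~14% of the negatives are appended once
extra_neg = int(scale * neg)                  # ≈ 5714 extra negative rows
new_share = pos / (pos + neg + extra_neg)     # ≈ 0.179, close to the requested p
print(scale, extra_neg, new_share)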
Exemplo n.º 55
0
    def run(self,
            x_train,
            y_train,
            x_test,
            y_test,
            x_validation_set=None,
            y_validation_set=None,
            meta=None):
        """Run factorization machine model against train and test data

        Parameters
        ----------
        x_train : {array-like, matrix}, shape = [n_train, n_features]
            Training data
        y_train : numpy array of shape [n_train]
            Target values
        x_test : {array-like, matrix}, shape = [n_test, n_features]
            Testing data
        y_test : numpy array of shape [n_test]
            Testing target values
        x_validation_set : optional, {array-like, matrix}, shape = [n_train, n_features]
            Validation data (only for SGDA)
        y_validation_set : optional, numpy array of shape [n_train]
            Validation target data (only for SGDA)
        meta : optional, numpy array of shape [n_features]
            Grouping input variables

        Returns
        -------
        Returns a `namedtuple` with the following properties:

        predictions: array [n_samples of x_test]
           Predicted target values per element in x_test.
        global_bias: float
            If k0 is True, returns the model's global bias w0
        weights: array [n_features]
            If k1 is True, returns the model's weights for each feature Wj
        pairwise_interactions: numpy matrix [n_features x k2]
            Matrix with pairwise interactions Vj,f
        rlog: pandas dataframe [nrow = num_iter]
            `pandas` DataFrame with measurements about each iteration
        """

        from sklearn.datasets import dump_svmlight_file

        TMP_SUFFIX = '.pywfm'
        train_fd = tempfile.NamedTemporaryFile(suffix=TMP_SUFFIX,
                                               dir=self.__temp_path)
        test_fd = tempfile.NamedTemporaryFile(suffix=TMP_SUFFIX,
                                              dir=self.__temp_path)
        out_fd = tempfile.NamedTemporaryFile(suffix=TMP_SUFFIX,
                                             dir=self.__temp_path)
        model_fd = tempfile.NamedTemporaryFile(suffix=TMP_SUFFIX,
                                               dir=self.__temp_path)

        # converts train and test data to libSVM format
        dump_svmlight_file(x_train, y_train, train_fd)
        train_fd.seek(0)
        dump_svmlight_file(x_test, y_test, test_fd)
        test_fd.seek(0)

        # builds arguments array
        args = [
            os.path.join(self.__libfm_path, "libFM"), '-task',
            "%s" % self.__task, '-train',
            "%s" % train_fd.name, '-test',
            "%s" % test_fd.name, '-dim',
            "'%s'" % self.__dim, '-init_stdev',
            "%g" % self.__init_stdev, '-iter',
            "%d" % self.__num_iter, '-method',
            "%s" % self.__learning_method, '-out',
            "%s" % out_fd.name, '-verbosity',
            "%d" % self.__verbose, '-save_model',
            "%s" % model_fd.name
        ]

        # appends rlog if true
        rlog_fd = None
        if self.__rlog:
            rlog_fd = tempfile.NamedTemporaryFile(suffix=TMP_SUFFIX,
                                                  dir=self.__temp_path)
            args.extend(['-rlog', "%s" % rlog_fd.name])

        # appends seed if given
        if self.__seed:
            args.extend(['-seed', "%d" % self.__seed])

        # appends arguments that only work for certain learning methods
        if self.__learning_method in ['sgd', 'sgda']:
            args.extend(['-learn_rate', "%.5f" % self.__learn_rate])

        if self.__learning_method in ['sgd', 'sgda', 'als']:
            args.extend(['-regular', "'%s'" % self.__regularization])

        # adds validation if sgda
        # if validation_set is None, libFM will throw an error, hence no validation is done in that case
        validation_fd = None
        if self.__learning_method == 'sgda' and (
                x_validation_set is not None and y_validation_set is not None):
            validation_fd = tempfile.NamedTemporaryFile(suffix=TMP_SUFFIX,
                                                        dir=self.__temp_path)
            dump_svmlight_file(x_validation_set, y_validation_set,
                               validation_fd.name)
            args.extend(['-validation', "%s" % validation_fd.name])

        # if meta data is given
        meta_fd = None
        if meta is not None:
            # NamedTemporaryFile has no `text` keyword; open in text mode instead
            meta_fd = tempfile.NamedTemporaryFile(mode='w+',
                                                  suffix=TMP_SUFFIX,
                                                  dir=self.__temp_path)
            # write group ids
            for group_id in meta:
                meta_fd.write("%s\n" % group_id)
            args.extend(['-meta', "%s" % meta_fd.name])
            meta_fd.seek(0)

        # if silent redirects all output
        stdout = None
        if self.__silent:
            stdout = open(os.devnull, 'wb')

        # call libFM with the parsed arguments
        # there was an unknown bug with the "-dim" option when passed as an array;
        # at the time we were forced to concatenate the arguments into a single
        # string (`args = ' '.join(args)`), but it seems to work now; needs further tests
        subprocess.call(args, shell=False, stdout=stdout)

        # reads output file
        preds = [float(p) for p in out_fd.read().split('\n') if p]

        # "hidden" feature that allows users to save the model
        # We use this to get the feature weights
        # https://github.com/srendle/libfm/commit/19db0d1e36490290dadb530a56a5ae314b68da5d
        import numpy as np
        global_bias = None
        weights = []
        pairwise_interactions = []
        # if 0, it's the global bias; if 1, weights; if 2, pairwise interactions
        out_iter = 0
        for line in model_fd.read().splitlines():
            # checks which line is starting with #
            if line.startswith('#'):
                if "#global bias W0" in line:
                    out_iter = 0
                elif "#unary interactions Wj" in line:
                    out_iter = 1
                elif "#pairwise interactions Vj,f" in line:
                    out_iter = 2
            else:
                # check the context set in the previous step and add accordingly
                if out_iter == 0:
                    global_bias = float(line)
                elif out_iter == 1:
                    weights.append(float(line))
                elif out_iter == 2:
                    try:
                        pairwise_interactions.append(
                            [float(x) for x in line.split(' ')])
                    except ValueError as e:
                        pairwise_interactions.append(
                            0.0)  #Case: no pairwise interactions used

        pairwise_interactions = np.matrix(pairwise_interactions)

        # parses rlog into a DataFrame
        if self.__rlog:
            import pandas as pd
            rlog_fd.seek(0)
            print(os.stat(rlog_fd.name).st_size)
            rlog = pd.read_csv(rlog_fd.name, sep='\t')
            rlog_fd.close()
        else:
            rlog = None

        if self.__learning_method == 'sgda' and (
                x_validation_set is not None and y_validation_set is not None):
            validation_fd.close()
        if meta is not None:
            meta_fd.close()

        # removes temporary output file after using
        train_fd.close()
        test_fd.close()
        model_fd.close()
        out_fd.close()

        # return as named collection for multiple output
        import collections
        fm = collections.namedtuple('model', [
            'predictions', 'global_bias', 'weights', 'pairwise_interactions',
            'rlog'
        ])
        return fm(preds, global_bias, weights, pairwise_interactions, rlog)
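A hedged sketch of how a wrapper exposing this run method might be driven; the FM class name and its constructor arguments are illustrative and not part of the snippet above:

import numpy as np
# from pywFM import FM  # assumed wrapper class exposing the run() shown above

fm = FM(task='classification', num_iter=10)   # hypothetical constructor
X_train = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0], [0.0, 0.0]])
y_train = np.array([1, 0, 1, 0])
model = fm.run(X_train, y_train, X_train, y_train)
print(model.predictions)   # one prediction per row of x_test
print(model.global_bias)   # w0, if k0 was enabled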
Exemplo n.º 56
0
    def test_feature_importances(self):
        data = np.random.randn(100, 5)
        target = np.array([0, 1] * 50)

        dump_svmlight_file(data, target, temp_name) 
        xgb.encrypt_file(temp_name, temp_enc_name, sym_key_file)
 
        features = ['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5']

        dm = xgb.DMatrix({username: temp_enc_name}, feature_names=features)
        params = {'objective': 'multi:softprob',
                  'eval_metric': 'mlogloss',
                  'eta': 0.3,
                  'num_class': 3}

        bst = xgb.train(params, dm, num_boost_round=10)

        # number of feature importances should == number of features
        scores1 = bst.get_score()
        scores2 = bst.get_score(importance_type='weight')
        scores3 = bst.get_score(importance_type='cover')
        scores4 = bst.get_score(importance_type='gain')
        scores5 = bst.get_score(importance_type='total_cover')
        scores6 = bst.get_score(importance_type='total_gain')
        assert len(scores1) == len(features)
        assert len(scores2) == len(features)
        assert len(scores3) == len(features)
        assert len(scores4) == len(features)
        assert len(scores5) == len(features)
        assert len(scores6) == len(features)

        # check backwards compatibility of get_fscore
        fscores = bst.get_fscore()
        assert scores1 == fscores

        dtrain = xgb.DMatrix({username: dpath + 'agaricus.txt.train.enc'})
        dtest = xgb.DMatrix({username: dpath + 'agaricus.txt.test.enc'})

        def fn(max_depth, num_rounds):
            # train
            params = {'max_depth': max_depth, 'eta': 1, 'verbosity': 0}
            bst = xgb.train(params, dtrain, num_boost_round=num_rounds)

            # predict
            preds = bst.predict(dtest)[0]
            contribs = bst.predict(dtest, pred_contribs=True)[0]

            # result should be (number of features + BIAS) * number of rows
            assert contribs.shape == (dtest.num_row(), dtest.num_col() + 1)

            # sum of contributions should be same as predictions
            np.testing.assert_array_almost_equal(np.sum(contribs, axis=1), preds)

        # for max_depth, num_rounds in itertools.product(range(0, 3), range(1, 5)):
        #     yield fn, max_depth, num_rounds

        # check that we get the right SHAP values for a basic AND example
        # (https://arxiv.org/abs/1706.06060)
        X = np.zeros((4, 2))
        X[0, :] = 1
        X[1, 0] = 1
        X[2, 1] = 1
        y = np.zeros(4)
        y[0] = 1
        param = {"max_depth": 2, "base_score": 0.0, "eta": 1.0, "lambda": 0}

        dump_svmlight_file(X, y, temp_name) 
        xgb.encrypt_file(temp_name, temp_enc_name, sym_key_file)

        bst = xgb.train(param, xgb.DMatrix({username: temp_enc_name}), 1)

        dump_svmlight_file(X[0:1, :], np.zeros(1), temp_name) 
        xgb.encrypt_file(temp_name, temp_enc_name, sym_key_file)

        out = bst.predict(xgb.DMatrix({username: temp_enc_name}), pred_contribs=True)[0]
        #TODO(rishabh): enable pred_contribs
        """
        assert out[0, 0] == 0.375
        assert out[0, 1] == 0.375
        assert out[0, 2] == 0.25
        """

        def parse_model(model):
            trees = []
            r_exp = r"([0-9]+):\[f([0-9]+)<([0-9\.e-]+)\] yes=([0-9]+),no=([0-9]+).*cover=([0-9e\.]+)"
            r_exp_leaf = r"([0-9]+):leaf=([0-9\.e-]+),cover=([0-9e\.]+)"
            for tree in model.get_dump(with_stats=True):
                lines = list(tree.splitlines())
                trees.append([None for i in range(len(lines))])
                for line in lines:
                    match = re.search(r_exp, line)
                    if match is not None:
                        ind = int(match.group(1))
                        while ind >= len(trees[-1]):
                            trees[-1].append(None)
                        trees[-1][ind] = {
                            "yes_ind": int(match.group(4)),
                            "no_ind": int(match.group(5)),
                            "value": None,
                            "threshold": float(match.group(3)),
                            "feature_index": int(match.group(2)),
                            "cover": float(match.group(6))
                        }
                    else:

                        match = re.search(r_exp_leaf, line)
                        ind = int(match.group(1))
                        while ind >= len(trees[-1]):
                            trees[-1].append(None)
                        trees[-1][ind] = {
                            "value": float(match.group(2)),
                            "cover": float(match.group(3))
                        }
            return trees

        def exp_value_rec(tree, z, x, i=0):
            if tree[i]["value"] is not None:
                return tree[i]["value"]
            else:
                ind = tree[i]["feature_index"]
                if z[ind] == 1:
                    if x[ind] < tree[i]["threshold"]:
                        return exp_value_rec(tree, z, x, tree[i]["yes_ind"])
                    else:
                        return exp_value_rec(tree, z, x, tree[i]["no_ind"])
                else:
                    r_yes = tree[tree[i]["yes_ind"]]["cover"] / tree[i]["cover"]
                    out = exp_value_rec(tree, z, x, tree[i]["yes_ind"])
                    val = out * r_yes

                    r_no = tree[tree[i]["no_ind"]]["cover"] / tree[i]["cover"]
                    out = exp_value_rec(tree, z, x, tree[i]["no_ind"])
                    val += out * r_no
                    return val

        def exp_value(trees, z, x):
            return np.sum([exp_value_rec(tree, z, x) for tree in trees])

        def all_subsets(ss):
            return itertools.chain(*map(lambda x: itertools.combinations(ss, x), range(0, len(ss) + 1)))

        def shap_value(trees, x, i, cond=None, cond_value=None):
            M = len(x)
            z = np.zeros(M)
            other_inds = list(set(range(M)) - set([i]))
            if cond is not None:
                other_inds = list(set(other_inds) - set([cond]))
                z[cond] = cond_value
                M -= 1
            total = 0.0

            for subset in all_subsets(other_inds):
                if len(subset) > 0:
                    z[list(subset)] = 1
                v1 = exp_value(trees, z, x)
                z[i] = 1
                v2 = exp_value(trees, z, x)
                total += (v2 - v1) / (scipy.special.binom(M - 1, len(subset)) * M)
                z[i] = 0
                z[list(subset)] = 0
            return total

        def shap_values(trees, x):
            vals = [shap_value(trees, x, i) for i in range(len(x))]
            vals.append(exp_value(trees, np.zeros(len(x)), x))
            return np.array(vals)

        def interaction_values(trees, x):
            M = len(x)
            out = np.zeros((M + 1, M + 1))
            for i in range(len(x)):
                for j in range(len(x)):
                    if i != j:
                        out[i, j] = interaction_value(trees, x, i, j) / 2
            svals = shap_values(trees, x)
            main_effects = svals - out.sum(1)
            out[np.diag_indices_from(out)] = main_effects
            return out

        def interaction_value(trees, x, i, j):
            M = len(x)
            z = np.zeros(M)
            other_inds = list(set(range(M)) - set([i, j]))

            total = 0.0
            for subset in all_subsets(other_inds):
                if len(subset) > 0:
                    z[list(subset)] = 1
                v00 = exp_value(trees, z, x)
                z[i] = 1
                v10 = exp_value(trees, z, x)
                z[j] = 1
                v11 = exp_value(trees, z, x)
                z[i] = 0
                v01 = exp_value(trees, z, x)
                z[j] = 0
                total += (v11 - v01 - v10 + v00) / (scipy.special.binom(M - 2, len(subset)) * (M - 1))
                z[list(subset)] = 0
            return total

        # test a simple and function
        M = 2
        N = 4
        X = np.zeros((N, M))
        X[0, :] = 1
        X[1, 0] = 1
        X[2, 1] = 1
        y = np.zeros(N)
        y[0] = 1
        param = {"max_depth": 2, "base_score": 0.0, "eta": 1.0, "lambda": 0}

        #TODO(rishabh): enable pred_contribs
        """
Exemplo n.º 57
0
def combine_feat(feat_names, feat_path_name):

    print("==================================================")
    print("Combine features...")

    ######################
    ## Cross-validation ##
    ######################
    print("For cross-validation...")
    ## for each run and fold
    for run in range(1, config.n_runs + 1):
        ## use 33% for training and 67% for validation
        ## so we switch trainInd and validInd
        for fold in range(1, config.n_folds + 1):
            print("Run: %d, Fold: %d" % (run, fold))
            path = "%s/Run%d/Fold%d" % (config.feat_folder, run, fold)
            save_path = "%s/%s/Run%d/Fold%d" % (config.feat_folder,
                                                feat_path_name, run, fold)
            if not os.path.exists(save_path):
                os.makedirs(save_path)

            for i, (feat_name, transformer) in enumerate(feat_names):

                ## load train feat
                feat_train_file = "%s/train.%s.feat.pkl" % (path, feat_name)
                with open(feat_train_file, "rb") as f:
                    x_train = cPickle.load(f)
                if len(x_train.shape) == 1:
                    x_train.shape = (x_train.shape[0], 1)

                ## load valid feat
                feat_valid_file = "%s/valid.%s.feat.pkl" % (path, feat_name)
                with open(feat_valid_file, "rb") as f:
                    x_valid = cPickle.load(f)
                if len(x_valid.shape) == 1:
                    x_valid.shape = (x_valid.shape[0], 1)

                ## align feat dim
                dim_diff = abs(x_train.shape[1] - x_valid.shape[1])
                if x_valid.shape[1] < x_train.shape[1]:
                    x_valid = hstack(
                        [x_valid,
                         np.zeros((x_valid.shape[0], dim_diff))]).tocsr()
                elif x_valid.shape[1] > x_train.shape[1]:
                    x_train = hstack(
                        [x_train,
                         np.zeros((x_train.shape[0], dim_diff))]).tocsr()

                ## apply transformation
                x_train = transformer.fit_transform(x_train)
                x_valid = transformer.transform(x_valid)

                ## stack feat
                if i == 0:
                    X_train, X_valid = x_train, x_valid
                else:
                    try:
                        X_train, X_valid = hstack([X_train, x_train]), hstack(
                            [X_valid, x_valid])
                    except Exception:
                        X_train, X_valid = np.hstack(
                            [X_train, x_train]), np.hstack([X_valid, x_valid])

                print("Combine {:>2}/{:>2} feat: {} ({}D)".format(
                    i + 1, len(feat_names), feat_name, x_train.shape[1]))
            print("Feat dim: {}D".format(X_train.shape[1]))

            ## load label
            # train
            info_train = pd.read_csv("%s/train.info" % (save_path))
            ## change it to zero-based for multi-classification in xgboost
            Y_train = info_train["median_relevance"] - 1
            # valid
            info_valid = pd.read_csv("%s/valid.info" % (save_path))
            Y_valid = info_valid["median_relevance"] - 1

            ## dump feat
            dump_svmlight_file(X_train, Y_train, "%s/train.feat" % (save_path))
            dump_svmlight_file(X_valid, Y_valid, "%s/valid.feat" % (save_path))

    ##########################
    ## Training and Testing ##
    ##########################
    print("For training and testing...")
    path = "%s/All" % (config.feat_folder)
    save_path = "%s/%s/All" % (config.feat_folder, feat_path_name)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    for i, (feat_name, transformer) in enumerate(feat_names):

        ## load train feat
        feat_train_file = "%s/train.%s.feat.pkl" % (path, feat_name)
        with open(feat_train_file, "rb") as f:
            x_train = cPickle.load(f)
        if len(x_train.shape) == 1:
            x_train.shape = (x_train.shape[0], 1)

        ## load test feat
        feat_test_file = "%s/test.%s.feat.pkl" % (path, feat_name)
        with open(feat_test_file, "rb") as f:
            x_test = cPickle.load(f)
        if len(x_test.shape) == 1:
            x_test.shape = (x_test.shape[0], 1)

        ## align feat dim
        dim_diff = abs(x_train.shape[1] - x_test.shape[1])
        if x_test.shape[1] < x_train.shape[1]:
            x_test = hstack([x_test,
                             np.zeros((x_test.shape[0], dim_diff))]).tocsr()
        elif x_test.shape[1] > x_train.shape[1]:
            x_train = hstack([x_train,
                              np.zeros((x_train.shape[0], dim_diff))]).tocsr()

        ## apply transformation
        x_train = transformer.fit_transform(x_train)
        x_test = transformer.transform(x_test)

        ## stack feat
        if i == 0:
            X_train, X_test = x_train, x_test
        else:
            try:
                X_train, X_test = hstack([X_train,
                                          x_train]), hstack([X_test, x_test])
            except Exception:
                X_train, X_test = np.hstack([X_train, x_train
                                             ]), np.hstack([X_test, x_test])

        print("Combine {:>2}/{:>2} feat: {} ({}D)".format(
            i + 1, len(feat_names), feat_name, x_train.shape[1]))
    print("Feat dim: {}D".format(X_train.shape[1]))

    ## load label
    # train
    info_train = pd.read_csv("%s/train.info" % (save_path))
    ## change it to zero-based for multi-classification in xgboost
    Y_train = info_train["median_relevance"] - 1
    # test
    info_test = pd.read_csv("%s/test.info" % (save_path))
    Y_test = info_test["median_relevance"] - 1

    ## dump feat
    dump_svmlight_file(X_train, Y_train, "%s/train.feat" % (save_path))
    dump_svmlight_file(X_test, Y_test, "%s/test.feat" % (save_path))
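An illustrative call of combine_feat; the feature names and transformers are placeholders for whatever was pickled in the earlier feature-generation step:

from sklearn.preprocessing import MaxAbsScaler, StandardScaler

feat_names = [
    ("counting", MaxAbsScaler()),                # placeholder feature name
    ("tfidf", StandardScaler(with_mean=False)),  # with_mean=False keeps sparse input sparse
]
combine_feat(feat_names, "svm_feat_combined")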
Exemplo n.º 58
0
def test_dump():
    X_sparse, y_dense = load_svmlight_file(datafile)
    X_dense = X_sparse.toarray()
    y_sparse = sp.csr_matrix(y_dense)

    # slicing a csr_matrix can unsort its .indices, so test that we sort
    # those correctly
    X_sliced = X_sparse[np.arange(X_sparse.shape[0])]
    y_sliced = y_sparse[np.arange(y_sparse.shape[0])]

    for X in (X_sparse, X_dense, X_sliced):
        for y in (y_sparse, y_dense, y_sliced):
            for zero_based in (True, False):
                for dtype in [np.float32, np.float64, np.int32, np.int64]:
                    f = BytesIO()
                    # we need to pass a comment to get the version info in;
                    # LibSVM doesn't grok comments so they're not put in by
                    # default anymore.

                    if (sp.issparse(y) and y.shape[0] == 1):
                        # make sure y's shape is: (n_samples, n_labels)
                        # when it is sparse
                        y = y.T

                    # Note: with dtype=np.int32 we are performing unsafe casts,
                    # where X.astype(dtype) overflows. The result is
                    # then platform dependent and X_dense.astype(dtype) may be
                    # different from X_sparse.astype(dtype).asarray().
                    X_input = X.astype(dtype)

                    dump_svmlight_file(X_input, y, f, comment="test",
                                       zero_based=zero_based)
                    f.seek(0)

                    comment = f.readline()
                    comment = str(comment, "utf-8")

                    assert "scikit-learn %s" % sklearn.__version__ in comment

                    comment = f.readline()
                    comment = str(comment, "utf-8")

                    assert ["one", "zero"][zero_based] + "-based" in comment

                    X2, y2 = load_svmlight_file(f, dtype=dtype,
                                                zero_based=zero_based)
                    assert X2.dtype == dtype
                    assert_array_equal(X2.sorted_indices().indices, X2.indices)

                    X2_dense = X2.toarray()
                    if sp.issparse(X_input):
                        X_input_dense = X_input.toarray()
                    else:
                        X_input_dense = X_input

                    if dtype == np.float32:
                        # allow a rounding error at the last decimal place
                        assert_array_almost_equal(
                            X_input_dense, X2_dense, 4)
                        assert_array_almost_equal(
                            y_dense.astype(dtype, copy=False), y2, 4)
                    else:
                        # allow a rounding error at the last decimal place
                        assert_array_almost_equal(
                            X_input_dense, X2_dense, 15)
                        assert_array_almost_equal(
                            y_dense.astype(dtype, copy=False), y2, 15)
Exemplo n.º 59
0
from sklearn.datasets import make_regression
from sklearn.datasets import dump_svmlight_file
import numpy as np

X, y = make_regression(
    **{
        'n_samples': 1000000,
        'n_features': 50,
        'n_informative': 4,
        'n_targets': 1,
        'random_state': 37
    })

output = 'regression.binary'
# data = dump_svmlight_file(X, y, output)
data = dump_svmlight_file(X, y, output, zero_based=False)
print(data)  # dump_svmlight_file returns None; the file is written as a side effect
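To sanity-check the dump, the file can be read back and its shape compared with what make_regression produced (a sketch, assuming the file above was written successfully):

from sklearn.datasets import load_svmlight_file

X2, y2 = load_svmlight_file(output, zero_based=False)
assert X2.shape == (1000000, 50)
assert y2.shape == (1000000,)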
Exemplo n.º 60
0
def create_svm_file(df, features_X, path):
    """
    Convert a pandas DF into a lib-SVM format!
	
	Args:
		- df (pd DF) : Pandas DF - needs to have the columns, that are passed
		               in the features_X argument!
					   
	    - features_X (list) : list of strings with the names of the attributes
	                          in df we want to use as feature!
	  
	    - path (string)     : path (incl. DF_Name) to save the lib-SVM at!
		                     [on top of "data/processed/Ranking/tf_ranking/"]
							 e.g. "CV/1.txt"
		
	Return: 
		- save the LIB-SVM DF
		- return the true responses of the passed df 
		  (used to calc the F1-Score later on!)
    """
    # [1] Check Input:
    #   - all feature names must be in df, so we can select the corresponding cols
    for _feat in features_X:
        if _feat not in df.columns.values:
            raise ValueError(_feat + " is not a column name in df")

    #   - features_X must start with "transport_mode"
    #     (needed to create valid predictions)
    if features_X[0] != "transport_mode":
        raise ValueError("'features_X' must start with 'transport_mode'")

    #   - is the path already defined?
    #     split folder and file name and check for the existence of the folder
    folder = path.split("/")
    folder = "/".join(folder[:-1])  # keep nested folders intact when rejoining
    if not os.path.isdir("data/processed/Ranking/tf_ranking/" + str(folder)):
        raise ValueError(
            str(folder) +
            " is not existent in 'data/processed/Ranking/tf_ranking/'")

    # [2] Clean the DF and get it ready!
    #   - sort the SIDs
    df.sort_values("sid", inplace=True)
    #   - drop rows that have the same transport_mode multiple times for a single sid!
    df = df.drop_duplicates(['sid', 'transport_mode'], keep='first')

    # [3] Create ranking target
    #   - if click_mode is present we mark the clicked mode with "1" and the irrelevant ones with "0"
    if 'click_mode' in df.columns:
        print("Build LTR labels")
        # 1 for the clicked mode <--> 0 else
        df = df.assign(target=df.apply(
            lambda x: 1 if x.click_mode == x.transport_mode else 0, axis=1))
    else:
        # if test set, every entry gets zero as a label
        print("Assign label 0 for test set")
        df = df.assign(target=0)

    # [4] Split the DF into Target & Feature + extract the SIDs
    #   - we pass these to the svm-converter to create a libsvm DF!
    X = df[features_X]
    y = df["target"]
    query_id = df.sid
    path = "data/processed/Ranking/tf_ranking/" + str(path)

    # [5] Save the SVM_File on top of: "data/processed/Ranking/tf_ranking"
    print("Dump file")
    dump_svmlight_file(X=X, y=y, f=path, query_id=query_id, zero_based=False)

    # [6] Return the Values of the true click_modes [needed for metrics!]
    return np.array(df.drop_duplicates(["sid"]).click_mode)
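An illustrative call, assuming df is the (training) ranking DataFrame described in the docstring, with a click_mode column; the feature columns besides "transport_mode" are placeholders:

features = ["transport_mode", "distance_plan", "price", "eta"]
true_click_modes = create_svm_file(df, features, "CV/1.txt")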