Example #1
def test_binarizer():
    X_ = np.array([[1, 0, 5], [2, 3, 0]])

    for init in (np.array, sp.csr_matrix):

        X = init(X_.copy())

        binarizer = Binarizer(threshold=2.0, copy=True)
        X_bin = toarray(binarizer.transform(X))
        assert_equal(np.sum(X_bin == 0), 4)
        assert_equal(np.sum(X_bin == 1), 2)

        binarizer = Binarizer(copy=True).fit(X)
        X_bin = toarray(binarizer.transform(X))
        assert X_bin is not X
        assert_equal(np.sum(X_bin == 0), 2)
        assert_equal(np.sum(X_bin == 1), 4)

        binarizer = Binarizer(copy=True)
        X_bin = binarizer.transform(X)
        assert X_bin is not X
        X_bin = toarray(X_bin)
        assert_equal(np.sum(X_bin == 0), 2)
        assert_equal(np.sum(X_bin == 1), 4)

        binarizer = Binarizer(copy=False)
        X_bin = binarizer.transform(X)
        assert X_bin is X
        X_bin = toarray(X_bin)
        assert_equal(np.sum(X_bin == 0), 2)
        assert_equal(np.sum(X_bin == 1), 4)
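
To see where the counts above come from: with threshold=2.0, only entries strictly greater than 2 map to 1. A minimal standalone sketch (assuming only NumPy and scikit-learn):

import numpy as np
from sklearn.preprocessing import Binarizer

X_ = np.array([[1, 0, 5], [2, 3, 0]])
# 5 and 3 are the only values strictly greater than 2.0: two ones, four zeros
print(Binarizer(threshold=2.0).transform(X_))
# [[0 0 1]
#  [0 1 0]]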
Example #2
def predict_bay(df_test_input, param_1, param_2):

    df_train_input = param_1
    df_train_output = param_2

    # Binarization
    transformer = Binarizer().fit(df_train_input)
    df_train_input_ = pd.DataFrame(transformer.transform(df_train_input))
    transformer = Binarizer().fit(df_test_input)
    df_test_input_ = pd.DataFrame(transformer.transform(df_test_input))

    # PCA
    # Choose the number of components ourselves
    number_principal_components = 100
    pca = PCA(n_components=number_principal_components)
    pca.fit(df_train_input_)
    principal_components_train = pca.transform(df_train_input_)
    # calculate the PCs for the test data as well
    principal_components_test = pca.transform(df_test_input_)

    # making the data non-negative
    lowest_num = 0
    if (principal_components_test.min() < principal_components_train.min()):
        lowest_num = principal_components_test.min()
    else:
        lowest_num = principal_components_train.min()
    principal_components_train = abs(lowest_num) + principal_components_train
    principal_components_test = abs(lowest_num) + principal_components_test

    # Bayes
    bayes = GaussianNB()
    bayes.fit(principal_components_train, df_train_output['class'].values)
    bayes_labels_pred = pd.DataFrame(bayes.predict(principal_components_test))
    return bayes_labels_pred, number_principal_components
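
Binarizer.fit is a no-op, so fitting a second instance on the test frame above is harmless, but the same flow can be written as one pipeline fitted on the training data only. A sketch (the component count and classifier mirror the function above; the non-negativity shift is dropped because GaussianNB does not require non-negative inputs):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Binarizer
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB

model = make_pipeline(Binarizer(), PCA(n_components=100), GaussianNB())
# model.fit(df_train_input, df_train_output['class'].values)
# bayes_labels_pred = model.predict(df_test_input)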
Example #3
def use_Binarizer():
    x = [[1., -1, 2.], [2., 0., 0.], [0., 1., -1.]]
    scaler = Binarizer()
    scaler.fit(x)  # not needed.
    print(scaler.transform(x))

    scaler = Binarizer(threshold=1.5)
    print(scaler.transform(x))

    # the simple function version of Binarizer
    print(preprocessing.binarize(x))
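
The function form accepts the same threshold argument, so the class and the one-liner are interchangeable; a quick check (a sketch):

from sklearn.preprocessing import Binarizer, binarize

x = [[1., -1., 2.], [2., 0., 0.], [0., 1., -1.]]
assert (Binarizer(threshold=1.5).transform(x) == binarize(x, threshold=1.5)).all()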
Example #4
def ExtractWordFeaturesWithDataframes(train_dataset_df,
                                      test_dataset_df,
                                      vectorizer_type="CountVectorizer",
                                      ngrams=None,
                                      balance_dataset=False,
                                      remove_center_interval=None):
    # Main logic of the method ExtractWordFeatures.

    (train_speeches, Y_train) = extractTextsAndLabelsFromDf(
        train_dataset_df,
        balance_dataset=balance_dataset,
        remove_center_interval=remove_center_interval)
    (test_speeches, Y_test) = extractTextsAndLabelsFromDf(
        test_dataset_df,
        balance_dataset=balance_dataset,
        remove_center_interval=remove_center_interval)

    if vectorizer_type == "CountVectorizer":
        if ngrams is not None:
            vectorizer = CountVectorizer(stop_words='english',
                                         token_pattern=r'[a-zA-Z]+',
                                         ngram_range=(1, ngrams))
        else:
            vectorizer = CountVectorizer(stop_words='english',
                                         token_pattern=r'[a-zA-Z]+')

    if vectorizer_type == "HashingVectorizer":
        vectorizer = CountVectorizer(stop_words='english',
                                     token_pattern=r'[a-zA-Z]+')

    if vectorizer_type == "TfidfVectorizer":
        if ngrams is not None:
            vectorizer = TfidfVectorizer(stop_words='english',
                                         token_pattern=r'[a-zA-Z]+',
                                         ngram_range=(1, ngrams))
        else:
            vectorizer = TfidfVectorizer(stop_words='english',
                                         token_pattern=r'[a-zA-Z]+')

    X_train = vectorizer.fit_transform(train_speeches)

    X_test = vectorizer.transform(test_speeches)

    if vectorizer_type == "HashingVectorizer":
        transformer = Binarizer().fit(X_train)
        X_train = transformer.transform(X_train)
        transformer = Binarizer().fit(X_test)
        X_test = transformer.transform(X_test)

    feature_names = vectorizer.get_feature_names()

    return X_train, Y_train, X_test, Y_test, vectorizer, feature_names
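
When only binary presence features are needed, CountVectorizer(binary=True) produces the same matrix as counting and then binarizing, so the extra Binarizer pass above could be skipped. A minimal sketch:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Binarizer

docs = ["the cat sat", "the cat sat on the cat"]
counted = CountVectorizer().fit_transform(docs)
direct = CountVectorizer(binary=True).fit_transform(docs)
assert (Binarizer().transform(counted) != direct).nnz == 0  # identical sparse matrices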
Example #5
def test_binarizer():
    b = Binarizer(np.mean(X))
    inputs = ['x{0}'.format(i + 1) for i in range(X.shape[1])]
    expr = skompile(b.transform, inputs)
    assert np.all(
        b.transform(X) == np.asarray(
            [expr.evaluate(x1=x[0], x2=x[1], x3=x[2], x4=x[3]) for x in X]))
Example #6
def get_eval_by_threshold(y_test, pred_proba_c1, thresholds):
    # Iterate over each value in the thresholds list and run evaluation.
    for custom_threshold in thresholds:
        binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_c1)
        custom_predict = binarizer.transform(pred_proba_c1)
        print('threshold:', custom_threshold)
        get_clf_eval(y_test, custom_predict, pred_proba_c1)
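
pred_proba_c1 is expected to be a 2-D column of positive-class probabilities; a typical call might look like this (the classifier and get_clf_eval are assumed to be defined elsewhere in the same project):

# pred_proba_c1 = clf.predict_proba(X_test)[:, 1].reshape(-1, 1)
# get_eval_by_threshold(y_test, pred_proba_c1, [0.4, 0.45, 0.5, 0.55, 0.6])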
Example #7
def cv_mean_std_array(X, y, alphas, ks, n_a, n_k, cv=20):
    n = n_a * n_k
    cv_mean = np.empty(n)
    cv_std = np.empty(n)
    regressors = pd.DataFrame()

    binarizer = Binarizer(threshold=1400)
    y_binary = binarizer.transform(y).transpose().ravel()

    itt_counter = 0
    print('size n_a: %d n_k: %d' % (n_a, n_k))
    for i in range(0, n_a):
        print('reg. column : %d' % (i * n_k))
        temp_string = 'alpha=%f' % alphas[i * n_k]
        print(temp_string)
        print(regressors.shape)
        df_temp = pd.DataFrame()
        print('computing for alpha = %f' % (alphas[n_k * i]))
        X_lasso, df_temp[temp_string] = df_Lasso(X, y, alphas[i * n_k])
        regressors = pd.concat([regressors, df_temp], ignore_index=True, axis=1)
        for j in range(0, n_k):
            print('i:%d, j:%d' % (i, j))
            print('computing for alpha = %f and k = %f' % (alphas[n_k * i + j], ks[n_k * i + j]))
            print('X_lasso shape:')
            print(X_lasso.shape)
            cv_mean[n_k * i + j], cv_std[n_k * i + j] = knn_cv_mean_and_std(X_lasso, y_binary, alphas[n_k * i + j], ks[n_k * i + j], cv=cv)
            itt_counter = itt_counter + 1
            print('completed %dth iteration of knn cv mean:%f std:%f, at pos:%d' % (itt_counter, cv_mean[n_k * i + j], cv_std[n_k * i + j], n_k * i + j))
    return cv_mean, cv_std, regressors
Example #8
    def get_feature_vectors(self, emails_bodies):
        #create a vectoriser
        vectorizer = TfidfVectorizer(analyzer='word',
                                     strip_accents=None,
                                     ngram_range=(1, 1),
                                     max_features=self.max_features,
                                     stop_words='english',
                                     norm=None)
        #train it on the emails body
        vectorizer = vectorizer.fit(emails_bodies)
        #transform the raw emails body into feature vectors
        features_vectors = vectorizer.transform(
            tqdm(emails_bodies, desc=" Creating emails feature vector"))
        #create a binarizer that turns the TF-IDF features into binary feature vectors
        # (0 for non-occurrence and 1 for occurrence)
        binarizer = Binarizer().fit(features_vectors)
        #needed for good word attack
        features_bin = binarizer.transform(features_vectors)

        #get the feature names, vocabulary and weights
        feature_names = vectorizer.get_feature_names()
        features_with_indices = vectorizer.vocabulary_
        features_weights = vectorizer.idf_

        return features_vectors, feature_names, features_with_indices, features_weights, features_bin
Example #9
def myxgb(x_train, x_test, y_train, y_test):

    params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'gamma': 0.1,
        'max_depth': 8,
        'alpha': 0,
        'lambda': 0,
        'subsample': 0.7,
        'colsample_bytree': 0.5,
        'min_child_weight': 3,
        'silent': 0,
        'eta': 0.03,
        'nthread': -1,
        'seed': 2019,
    }
    num_round = 180
    dtrain = xgb.DMatrix(x_train, y_train)
    bst = xgb.train(params, dtrain, num_round)
    #pickle.dump(bst,open("xgboostclass2.dat","wb"))
    dtest = xgb.DMatrix(x_test, y_test)
    #loaded_model = pickle.load(open("xgboostclass2.dat","rb"))
    ypreds = bst.predict(dtest)
    #print(y_test)
    #print(ypreds)
    bn = Binarizer(threshold=0.42444044)
    ypreds = bn.transform(ypreds.reshape(-1, 1))
    print("myxgb精度为:", accuracy_score(y_test, ypreds))
Example #10
    def rescaleData(cls):
        dataframe = read_csv(cls.filename, names=cls.names)
        array = dataframe.values

        #separate array into input and output components
        X = array[:, 0:8]
        y = array[:, 8]

        print("\nRescaled with MinMaxScaler")
        scaler = MinMaxScaler(feature_range=(0, 1))
        rescaledX = scaler.fit_transform(X)
        #summarize transformed data
        set_printoptions(precision=3)
        print(rescaledX[0:5, :])

        print("\nRescaled with StandardScaler")
        scaler = StandardScaler().fit(X)
        rescaledX = scaler.transform(X)
        #summarize transformed data
        set_printoptions(precision=3)
        print(rescaledX[0:5, :])

        print("\nRescaled with Normalizer")
        scaler = Normalizer().fit(X)
        normalizedX = scaler.transform(X)
        #summarize transformed data
        set_printoptions(precision=3)
        print(normalizedX[0:5, :])

        binarizer = Binarizer(threshold=0.0).fit(X)
        binaryX = binarizer.transform(X)
        #summarize transformed data
        set_printoptions(precision=3)
        print(binaryX[0:5, :])
Example #11
def binarize_data():
    from sklearn.preprocessing import Binarizer
    array = load_data()
    x = array[:, 0:8]
    y = array[:, 8]
    binarizer = Binarizer(threshold=0.0).fit(x)
    binaryx = binarizer.transform(x)
    return binaryx, binaryx[0:5, :]
Example #12
 def binarizer(df):
     """
         根据阈值对数据进行二值化(将特征值设置为0或1)
     """
     X = df.values
     transformer = Binarizer().fit(X)  # fit does nothing.
     matrix = transformer.transform(X)
     return matrix
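
With the default threshold of 0.0, positive entries map to 1 while zeros and negatives map to 0. A usage sketch with a made-up DataFrame, assuming the function above is in scope:

import pandas as pd

df = pd.DataFrame({'a': [0.5, -1.0, 0.0], 'b': [2.0, 0.0, -0.3]})
print(binarizer(df))
# [[1. 1.]
#  [0. 0.]
#  [0. 0.]]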
Example #13
def evaluate(threshold, meridian, result_data, true_data):
    meridian_names = ['LUNG', 'SPLEEN', 'STOMACH', 'HEART', 'KIDNEY', 'LIVER', 'LARGE INTESTINE']
    binarizer = Binarizer(threshold).fit(result_data)
    result_binary = pd.DataFrame(binarizer.transform(result_data), columns=meridian_names)
    eva_scores = list(map(lambda x: x(true_data[meridian], result_binary[meridian]),
                          list_of_functions))
    auc_score = roc_auc_score(true_data[meridian], result_data[meridian])
    eva_scores.append(auc_score)
    return eva_scores
Example #14
def get_eval_by_threshold(y_test, pred_proba_c1, threshold):

    # Iterate over each value in the threshold list and run evaluation
    for custom_th in threshold:
        binarizer = Binarizer(threshold=custom_th).fit(pred_proba_c1)
        custom_pred = binarizer.transform(pred_proba_c1)

        print('threshold:', custom_th)
        get_clf_eval(y_test, custom_pred)
Example #15
def cv_mean_std_array(X, y, alphas, n_a, cv=20):
    binarizer = Binarizer(threshold=1400)
    y_binary = binarizer.transform(y).transpose().ravel()
    cv_ols_means, cv_ols_stds, cv_lasso_means, cv_lasso_stds, cv_ridge_means, cv_ridge_stds = np.empty(n_a), np.empty(n_a), np.empty(n_a), np.empty(n_a), np.empty(n_a), np.empty(n_a)

    for i in range(0, n_a):
        print('computing for alpha=%f' % alphas[i])
        cv_ols_means[i], cv_ols_stds[i], cv_lasso_means[i], cv_lasso_stds[i], cv_ridge_means[i], cv_ridge_stds[i] = lm_cv_mean_and_std(X, y_binary, alphas[i])
        print('successfully computed iteration %d' % i)
    return cv_ols_means, cv_ols_stds, cv_lasso_means, cv_lasso_stds, cv_ridge_means, cv_ridge_stds
Example #16
def initialize():
    images, labels = load_mnist_data()

    binarizer = Binarizer().fit(images)
    images_binarized = binarizer.transform(images)

    knn = KNeighborsClassifier(n_neighbors=3, metric='jaccard')
    knn.fit(images_binarized, labels)

    return knn
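
load_mnist_data is assumed to be defined elsewhere; a compatible sketch using scikit-learn's OpenML loader could look like this:

from sklearn.datasets import fetch_openml

def load_mnist_data():
    mnist = fetch_openml('mnist_784', version=1, as_frame=False)
    return mnist.data, mnist.target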
Example #17
    def convert_bin(self, feat):
        feat_bin = Binarizer(threshold=0.0).fit(feat.reshape(-1, 1))
        feat_bin = feat_bin.transform(feat.reshape(-1, 1)).squeeze()
        feat_bin = ''.join(
            [bin(int(i)).replace('0b', '') for i in list(feat_bin)])

        # binary/16bit---->int
        feat_bin = re.findall(r'.{16}', feat_bin)
        feat_int = np.array([int(i, base=2) for i in feat_bin])
        return feat_int
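
A worked example of the packing step: 32 binarized values become two 16-bit integers (a sketch using the same re/NumPy calls as above):

import re
import numpy as np

bits = '1010101010101010' + '0000000000000001'
chunks = re.findall(r'.{16}', bits)
print(np.array([int(c, base=2) for c in chunks]))  # [43690     1]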
Example #18
def run_binarizer():

    x = [[1, 2, 3, 4, 5, 6, 7], [3, 4, 5, 6, 7, 8, 9], [1, 7, 2, 6, 2, 7, 2],
         [3, 8, 6, 2, 8, 3, 8]]

    print(x)

    binarizer = Binarizer(threshold=4)
    print(binarizer.transform(x))
Example #19
def test_binarizer():
    x = [
        [1, 2, 3, 4, 5],
        [5, 4, 3, 2, 1],
        [3, 3, 3, 3, 3],
        [1, 1, 1, 1, 1]
    ]
    from sklearn.preprocessing import Binarizer
    print("before transform:", x)
    binarizer = Binarizer(threshold=2.5)  # the threshold parameter sets the per-feature cutoff
    print("after transform:", binarizer.transform(x))
Example #20
def main():
    PATH = "../pima-indians-diabetes.data.csv"
    columns = [
        'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'
    ]
    df = read_csv(PATH, names=columns)
    array = df.values
    X = array[:, 0:8]
    Y = array[:, 8]
    binarizer = Binarizer(threshold=0.0).fit(X)
    binaryX = binarizer.transform(X)
    set_printoptions(precision=3)
    print(binaryX[0:5, :])
Example #21
    def test_onnxrt_python_Binarizer(self):
        iris = load_iris()
        X, y = iris.data, iris.target
        X_train, X_test, y_train, _ = train_test_split(X, y, random_state=11)
        clr = Binarizer()
        clr.fit(X_train, y_train)

        model_def = to_onnx(clr, X_train.astype(numpy.float32))
        oinf = OnnxInference(model_def)
        got = oinf.run({'X': X_test})
        self.assertEqual(list(sorted(got)), ['variable'])
        exp = clr.transform(X_test)
        self.assertEqualArray(exp, got['variable'], decimal=6)
Example #22
def test_binarizer_vs_sklearn():
    # Compare msmbuilder.preprocessing.Binarizer
    # with sklearn.preprocessing.Binarizer

    binarizerr = BinarizerR()
    binarizerr.fit(np.concatenate(trajs))

    binarizer = Binarizer()
    binarizer.fit(trajs)

    y_ref1 = binarizerr.transform(trajs[0])
    y1 = binarizer.transform(trajs)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
Example #24
def informationGain(texts, labels, nFeatures = 10000):
    vectorizer = CountVectorizer(token_pattern = '[a-zA-Z]+', stop_words='english')
    bow = vectorizer.fit_transform(texts)
    transformer = Binarizer().fit(bow)
    bow = transformer.transform(bow)
    names = vectorizer.get_feature_names()
    
    if nFeatures != -1:
        pos_train = []
        neg_train = []
        for i in range(0, len(labels)):
            if labels[i] == -1.0:
                neg_train.append(i)
            else:
                pos_train.append(i)
                
        pos_matrix = bow.tocsr()[pos_train,:]
        neg_matrix = bow.tocsr()[neg_train,:]
        diff = [abs(x - y) for x,y in zip(pos_matrix.mean(axis = 0).tolist()[0], neg_matrix.mean(axis = 0).tolist()[0])]
       
        indexes = []
        
        indexes_sorted = [i[0] for i in sorted(enumerate(diff), key=lambda x:x[1])]
        names_sorted = [names[i] for i in indexes_sorted]
        
        indexes = indexes_sorted[len(indexes_sorted)-nFeatures:len(indexes_sorted)]
        names = names_sorted[len(indexes_sorted)-nFeatures:len(indexes_sorted)]
        bow = bow.tocsr()[:,indexes]
    
    info_gain = {}
    
    labels_entropy = entropy(labels)
    count = 0
    for w in names:
        count += 1
        if count%500 == 0:
            print(count/bow.shape[1]*100)
        texts_with_w_labels = []
        texts_without_w_labels = []
        index = names.index(w)
        column = bow[:,index]
        
        with_indices = find(column)[0].tolist()
        texts_with_w_labels = [labels[i] for i in list(range(0,len(labels))) if i in with_indices ]
        texts_without_w_labels = [labels[i] for i in list(range(0,len(labels))) if i not in with_indices ]
        info_gain_w = labels_entropy - (float(len(texts_with_w_labels))/float(len(labels))) * entropy(texts_with_w_labels) -(float(len(texts_without_w_labels))/float(len(labels))) * entropy(texts_without_w_labels)
    
        info_gain[w] = info_gain_w
        
    return info_gain
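
The entropy helper used above is assumed to be defined elsewhere; the loop computes IG(w) = H(Y) - p(w) H(Y|w) - p(not w) H(Y|not w). A minimal compatible sketch of the helper:

import math
from collections import Counter

def entropy(labels):
    # Shannon entropy (in bits) of a list of labels
    n = len(labels)
    return -sum((c / n) * math.log2(c / n) for c in Counter(labels).values())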
Example #25
class BinarizerImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
Example #26
def Binarize_Dataset():
    s = start_date()
    e = end_date()
    sym = input_symbol()
    df = yf.download(sym, s, e)
    array = df.values
    X = array[:, 0:5]
    Y = array[:, 5]
    # initialising the binarizer
    binarizer = Binarizer(threshold=0.0).fit(X)
    binaryX = binarizer.transform(X)
    np.set_printoptions(precision=3)
    print(
        'Binarize: values less than or equal to 0 are marked 0 and all those above 0 are marked 1'
    )
    print(binaryX[0:5, :])
    print("")
    # Splitting the datasets into training sets and Test sets
    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.2,
                                                        random_state=0)
    sc_X = StandardScaler()
    # Scale features using statistics fitted on the training set only
    X_train = sc_X.fit_transform(X_train)
    X_test = sc_X.transform(X_test)
    print("Training Dataset")
    print(X_train)
    print("")
    print(Y_train)
    print("")
    print("Testing Dataset")
    print(X_test)
    print("")
    print(Y_test)
    print("")
    ans = ['1', '2']
    user_input = input("""                  
What would you like to do next? Enter option 1 or 2.  
1. Menu
2. Exit
Command: """)
    while user_input not in ans:
        print("Error: Please enter a a valid option 1-2")
        user_input = input("Command: ")
    if user_input == "1":
        menu()
    elif user_input == "2":
        exit()
Example #27
def test_Binarizer():
    '''
    test Binarizer method
    :return: None
    '''
    X = [[1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [3, 3, 3, 3, 3], [1, 1, 1, 1, 1]]
    print("before transform:", X)
    binarizer = Binarizer(threshold=2.5)
    print("after transform:", binarizer.transform(X))
Example #28
def test_Binarizer():
    '''
    Test the usage of Binarizer
    :return: None
    '''
    X = [[1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [3, 3, 3, 3, 3], [1, 1, 1, 1, 1]]
    print("before transform:", X)
    binarizer = Binarizer(threshold=2.5)  # threshold: values above it map to 1, all others to 0
    print("after transform:", binarizer.transform(X))
def create_target(data, threshold=0.0):
    '''
    Create target variable that is binary {0,1}.
    Split into X and y.
    data: dataframe
    '''

    data1 = data.dropna().copy()
    binarizer = Binarizer(threshold=threshold)
    target = binarizer.transform(data1[('Returns','Next_Month')].values.reshape(-1,1))

    data1 = data1.join(pd.DataFrame(target,
        columns=pd.MultiIndex.from_product([['Returns'], ['Target']]),
        index=data1.index))

    return data1
Example #30
def binarize():
    for key, value in userScript.userDefinedBinarizeColumns.items():
        col = key
        if dataType.dataType(col, df) != "str":
            # user-defined threshold
            userThreshold = value[0]

            binarizeColumn = df.filter([col], axis=1)
            df.drop(col, axis=1, inplace=True)

            array = binarizeColumn.values
            binarizer = Binarizer(threshold=userThreshold).fit(array)
            binary = binarizer.transform(array)

            # flatten the (n, 1) result back into a single column
            df[col] = binary.ravel()
        else:
            print("The column,", col, "is of type: string. Cannot binarize")
Example #31
class BinarizerTransformer(NumericTransformer):
    def __init__(self, column_id, threshold=0.0):
        NumericTransformer.__init__(self, column_id, "binary", 1)
        self.threshold = threshold
        self.model = Binarizer(self.threshold)

    def transform1(self, column_data):
        where_are_NaNs = np.isnan(column_data)
        column_data[where_are_NaNs] = -1

        return np.matrix(self.model.transform(column_data.reshape(1, -1))).T

    def transform(self, dataset, ids):
        column_data = np.array(dataset.values[ids, self.column_id],
                               dtype=np.float64)

        return self.transform1(column_data)
Example #32
    def test_binarizer_converter(self):
        data = np.array([[1, 2, -3], [4, -3, 0], [0, 1, 4], [0, -5, 6]],
                        dtype=np.float32)
        data_tensor = torch.from_numpy(data)

        for threshold in [0.0, 1.0, -2.0]:
            model = Binarizer(threshold=threshold)
            model.fit(data)

            torch_model = hummingbird.ml.convert(model, "torch")
            self.assertIsNotNone(torch_model)
            np.testing.assert_allclose(
                model.transform(data),
                torch_model.transform(data_tensor),
                rtol=1e-06,
                atol=1e-06,
            )
Example #33
    def test_default(self):
        X_train, X_test, y_train, y_test, feature_names = self.load_data()

        actual = BinarizerComponent()
        config = self.get_default(actual)

        actual.set_hyperparameters(config)
        actual.fit(X_train, y_train)
        X_actual = actual.transform(np.copy(X_test))

        expected = Binarizer()
        expected.fit(X_train, y_train)
        X_expected = expected.transform(X_test)

        assert actual.get_feature_names_out(feature_names).tolist() == feature_names
        assert repr(actual.estimator_) == repr(expected)
        assert np.allclose(X_actual, X_expected)
Example #34
File: fs.py  Project: Roche/AMASC
def ge_transform(df_GE, genes):
    scaler = MinMaxScaler()
    print(len(df_GE))
    binarizer = Binarizer(threshold=threshold_binarize)
    df_features = df_GE.transpose()
    print(len(df_features))
    df_features = df_features.groupby(df_features.columns, axis=1).agg(max)

    df_features = df_features[genes]
    scaler.fit(df_features)
    df_features = scaler.transform(df_features)
    binarizer.fit(df_features)
    df_features = binarizer.transform(df_features)
    print(len(df_features))
    df_features = pd.DataFrame(df_features)

    df_features.columns = genes
    return df_features
Example #35
def test_binarizer():
    X_ = np.array([[1, 0, 5], [2, 3, -1]])

    for init in (np.array, list, sparse.csr_matrix, sparse.csc_matrix):

        X = init(X_.copy())

        binarizer = Binarizer(threshold=2.0, copy=True)
        X_bin = toarray(binarizer.transform(X))
        assert_equal(np.sum(X_bin == 0), 4)
        assert_equal(np.sum(X_bin == 1), 2)
        X_bin = binarizer.transform(X)
        assert_equal(sparse.issparse(X), sparse.issparse(X_bin))

        binarizer = Binarizer(copy=True).fit(X)
        X_bin = toarray(binarizer.transform(X))
        assert_true(X_bin is not X)
        assert_equal(np.sum(X_bin == 0), 2)
        assert_equal(np.sum(X_bin == 1), 4)

        binarizer = Binarizer(copy=True)
        X_bin = binarizer.transform(X)
        assert_true(X_bin is not X)
        X_bin = toarray(X_bin)
        assert_equal(np.sum(X_bin == 0), 2)
        assert_equal(np.sum(X_bin == 1), 4)

        binarizer = Binarizer(copy=False)
        X_bin = binarizer.transform(X)
        if init is not list:
            assert_true(X_bin is X)
        X_bin = toarray(X_bin)
        assert_equal(np.sum(X_bin == 0), 2)
        assert_equal(np.sum(X_bin == 1), 4)

    binarizer = Binarizer(threshold=-0.5, copy=True)
    for init in (np.array, list):
        X = init(X_.copy())

        X_bin = toarray(binarizer.transform(X))
        assert_equal(np.sum(X_bin == 0), 1)
        assert_equal(np.sum(X_bin == 1), 5)
        X_bin = binarizer.transform(X)

    # Cannot use threshold < 0 for sparse
    assert_raises(ValueError, binarizer.transform, sparse.csc_matrix(X))
Example #36
                              ('if|',InteractionFeatures(method = lambda x,y:(y/x), threshold = corr_thresh,subsample = 1,logger=logger))
                               ])
                             
pp_pipeline = Pipeline([
                        ('removedupes',RemoveDuplicateCols(logger=logger)),
                        ('featureextraction',featureunion1),
                        ('bounder',Bounder(inf,-inf))
                        ])

#%%

idvs_raw = numpy.load(datafilename + ".npy")

dvs = numpy.load(datafilename + "_dvs.npy")

dvs_binary = binarizer.transform(dvs).reshape((dvs.shape[0],))

idvs = pp_pipeline.fit_transform(idvs_raw, dvs_binary)

logger.debug("Building models with %s idvs", idvs.shape[1])

#%% Loss models

#corrs = numpy.array([numpy.abs(numpy.corrcoef(dvs_binary.T,idvs[:,i])[0,1]) for i in xrange(idvs.shape[1])])
#corrs2 = numpy.array([numpy.abs(numpy.corrcoef(dvs_binary.T,idvs2[:,i])[0,1]) for i in xrange(idvs2.shape[1])])

#idvs3 = numpy.hstack((idvs[:,numpy.where(corrs>0.145)[0]],idvs2[:,numpy.where(corrs2>0.11)[0]],))

#print idvs3.shape

idvs = Bounder(inf,-inf).transform(idvs)
Example #37
# # Binarization

# In[6]:

watched = np.array(popsong_df['listen_count']) 
watched[watched >= 1] = 1
popsong_df['watched'] = watched
popsong_df.head(10)


# In[7]:

from sklearn.preprocessing import Binarizer

bn = Binarizer(threshold=0.9)
pd_watched = bn.transform([popsong_df['listen_count']])[0]
popsong_df['pd_watched'] = pd_watched
popsong_df.head(11)
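
The Binarizer call reproduces the manual thresholding from the previous cell: with threshold=0.9, any listen_count of 1 or more maps to 1, so pd_watched equals watched. A one-line pandas equivalent (a sketch):

# popsong_df['pd_watched'] = (popsong_df['listen_count'] > 0.9).astype(int)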


# # Rounding

# In[8]:

items_popularity = pd.read_csv('datasets/item_popularity.csv', encoding='utf-8')
items_popularity


# In[9]:

items_popularity['popularity_scale_10'] = np.array(np.round((items_popularity['pop_percent'] * 10)), dtype='int')
Example #38
varSizeStatisticsTrain = zeros(numCombinations, dtype=float)
varSizeStatisticsTest = zeros(numCombinations, dtype=float)

a = 0

mnist = fetch_mldata('MNIST original')

# split a training set and a test set
y_train, y_test = mnist.target[:60000], mnist.target[60000:70000]

#vectorizer = CountVectorizer(binary=True)
X_both = mnist.data

binarizer = Binarizer(threshold=50).fit(X_both)
X_both = binarizer.transform(X_both)

X_train = X_both[:60000]
X_test =  X_both[60000:70000]

#print X_train[1]

#ch2 = SelectKBest(chi2, 750)
#X_train = ch2.fit_transform(X_train, y_train)
#X_test = ch2.transform(X_test)

data_train = X_train
m,n = data_train.shape

print m," ",n
Example #39
X = (news_data * lasso_est.transpose()) # multiply element wise with lasso estimate
df_Lasso = X[X.columns[(X != 0).any()]] # remove columns where all elements are zero
print df_Lasso.shape # number of columns should significantly shrink depending on choice of alpha
df_Lasso.columns.values.tolist()


# In[104]:

#obtain a split
# from sklearn.cross_validation import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(df_Lasso, news_labels)

#binarize
from sklearn.preprocessing import Binarizer
binarizer = Binarizer(threshold=binary_threshold)
binary_labels = binarizer.transform(news_labels).transpose().ravel()     # .ravel() is to fix "Too many array indices error"
print binary_labels.shape


# In[107]:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import cross_val_score

knn = KNeighborsClassifier(n_neighbors=1) # arbitrary k
cv = cross_val_score(knn, df_Lasso, binary_labels, cv=10)
print "Cross Validation Scores"
print cv
print 'Mean Cross Validation Score'
print np.mean(cv)
Example #40
# binarization
from sklearn.preprocessing import Binarizer
import pandas
import numpy
url = "https://goo.gl/vhm1eU"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] 
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values
# separate array into input and output components
X = array[:,0:8]
Y = array[:,8]
binarizer = Binarizer(threshold=0.0).fit(X)
binaryX = binarizer.transform(X)
# summarize transformed data
numpy.set_printoptions(precision=3)
print(binaryX[0:5,:])
Example #41
#	Comment section below out if you already have made pickle files
#
#---------------------------------------------------------------------------------------

all_bigr = ngram(X_train, 'bigram') #starting with all features

print "Starting counting bigrams..."
X_train_bi_counted = count(X_train, all_bigr, 'bigram')
print "Done counting train set"
X_test_bi_counted = count(X_test, all_bigr, 'bigram')
print "Done counting test set"

print "Binarizing and dumping files"
bin = Binarizer()
X_train_bi_binary = bin.fit_transform(X_train_bi_counted)
X_test_bi_binary = bin.transform(X_test_bi_counted)
pickle.dump(X_train_bi_binary, open( "X_train_bi_binary.p", "wb" ) )
pickle.dump(X_test_bi_binary, open( "X_test_bi_binary.p", "wb" ) )
print "Done"


print "Starting tfidf vectors..."
X_train_bi_tfidf, X_test_bi_tfidf = tfidf(X_train_bi_counted, X_test_bi_counted)
pickle.dump(X_train_bi_tfidf, open( "X_train_bi_tfidf.p", "wb" ) )
pickle.dump(X_test_bi_tfidf, open( "X_test_bi_tfidf.p", "wb" ) )
print "Done"


print "Starting feature selection using CART random forests on binary files"
indices_important_feats_bi_bin = tree(X_train_bi_binary, y_train, all_bigr, 'Bigram_binary')
pickle.dump(indices_important_feats_bi_bin, open( "indices_important_feats_bi_bin.p", "wb" ) )
Example #42
class Binarizer(TransformerMixin):
    """
    Реализует различные стратегии бинаризации признаков,
    вычисляя оптимальные пороги и производя бинаризацию с данными порогами

    Аргументы:
    ----------
    method: str('random', 'log_odds' or 'bns'), метод бинаризации признаков
    divide_to_bins: bool(optional, default=True),
        индикатор приведения количественных признаков к целочисленным
    bins_number: int(optional, default=10),
        число возможных значений целочисленных признаков при бинаризации
    """
    _UNSUPERVISED_METHODS = ['random']
    _SUPERVISED_METHODS = ['log_odds', 'bns']
    _CONTINGENCY_METHODS = ['log_odds', 'bns']

    def __init__(self, method, divide_to_bins=True, bins_number=10):
        self.method = method
        self.divide_to_bins = divide_to_bins
        self.bins_number = bins_number

    def fit(self, X, y=None):
        """
        Обучает бинаризатор на данных
        """
        # print("Fitting binarizer...")
        methods = Binarizer._UNSUPERVISED_METHODS + Binarizer._SUPERVISED_METHODS
        if self.method not in methods:
            raise ValueError("Method should be one of {0}".format(", ".join(methods)))
        X = check_array(X, accept_sparse=['csr', 'csc'])
        if issparse(X):
            X = X.tocsc()
        if self.method in Binarizer._UNSUPERVISED_METHODS:
            self._fit_unsupervised(X)
            self.joint_thresholds_ = self.thresholds_
            self.joint_scores_ = self.scores_
        else:
            if y is None:
                raise ValueError("y must not be None for supervised binarizers.")
            # TODO: move this into a separate function
            # y = np.array(y)
            # if len(y.shape) == 1:
            #     self.classes_, y = np.unique(y, return_inverse=True)
            #     nclasses = self.classes_.shape[0]
            #     Y_new = np.zeros(shape=(y.shape[0], nclasses), dtype=int)
            #     Y_new[np.arange(y.shape[0]), y] = 1
            # else:
            #     self.classes_ = np.arange(y.shape[1])
            #     Y_new = y
            label_binarizer = SK_LabelBinarizer()
            Y_new = label_binarizer.fit_transform(y)
            self.classes_ = label_binarizer.classes_
            if X.shape[0] != Y_new.shape[0]:
                raise ValueError("X and y have incompatible shapes.\n"
                                 "X has %s samples, but y has %s." %
                                 (X.shape[0], Y_new.shape[0]))
            self._fit_supervised(X, Y_new)
            if len(self.classes_) <= 2:
                self.joint_thresholds_ = self.thresholds_[:, 0]
                self.joint_scores_ = self.scores_[:, 0]
            else:
                min_class_scores = np.min(self.scores_, axis=0)
                max_class_scores = np.max(self.scores_, axis=0)
                diffs = max_class_scores - min_class_scores
                diffs[np.where(diffs == 0)] = 1
                normalized_scores = (self.scores_ - min_class_scores) / diffs
                # find, for each feature, the class for which it is most useful
                # THERE IS PROBABLY A BETTER WAY TO DO THIS
                optimal_indexes = np.argmax(normalized_scores, axis=1)
                nfeat = self.thresholds_.shape[0]
                # as the binarization threshold for each feature, take
                # its value for the class where the feature is most useful
                self.joint_thresholds_ = self.thresholds_[np.arange(nfeat), optimal_indexes]
                self.joint_scores_ = self.scores_[np.arange(nfeat), optimal_indexes]
        # hand the thresholds over to sklearn's Binarizer
        self.binarize_transformer_ = SK_Binarizer(self.joint_thresholds_)
        return self

    def transform(self, X):
        """
        Применяем бинаризатор к данным
        """
        print("Transforming binarizer...")
        if hasattr(self, 'binarize_transformer_'):
            return self.binarize_transformer_.transform(X)
        else:
            raise ValueError("Transformer is not fitted")

    def _fit_unsupervised(self, X):
        """
        Управляющая функция для методов подбора порога без учителя
        """
        if self.method == 'random':
            # random thresholds and random usefulness scores
            if issparse(X):
                minimums = X.min(axis=0).toarray()
                maximums = X.max(axis=0).toarray()
            else:
                minimums = np.min(X, axis=0)
                maximums = np.max(X, axis=0)
            random_numbers = np.random.rand(X.shape[1], 1).reshape((X.shape[1],))
            self.thresholds_ = minimums + (maximums - minimums) * random_numbers
            self.scores_ = np.random.rand(X.shape[1], 1).reshape((X.shape[1],))
        return self

    def _fit_supervised(self, X, y):
        """
        Выполняет подбор порогов с учителем
        """
        # приводим X к целочисленным значениям, если нужно
        if self.divide_to_bins:
            bin_divider = BinDivider(bins_number=self.bins_number)
            X = bin_divider.fit_transform(X)
        thresholds, scores = [], []
        for i in range(X.shape[1]):
            threshold, score = self._find_optimal_thresholds(X[:, i], y)
            thresholds.append(threshold)
            scores.append(score)
        self.thresholds_ = np.asarray(thresholds, dtype=np.float64)
        self.scores_ = np.asarray(scores, dtype=np.float64)
        return self

    def _find_optimal_thresholds(self, column, y):
        """
        Вычисляет пороги для бинаризации

        Аргументы:
        ----------
        column: array-like, shape=(nobj,), колонка значений признаков
        y: array-like, shape=(nobj, nclasses), 0/1-матрица классов
        """
        classes_number = y.shape[1]
        # compute per-class occurrence counts of the feature values
        values, counts = \
            _collect_column_statistics(column, y, classes_number=classes_number, precision=6)
        if self.method in Binarizer._CONTINGENCY_METHODS:
            # binary classification
            if classes_number <= 2:
                counts = [counts]
            else:
                summary_counts = np.sum(counts, axis=1)
                counts = [np.array((summary_counts - counts[:, i], counts[:, i])).T
                          for i in np.arange(classes_number)]
            best_thresholds = [None] * len(counts)
            best_scores = [None] * len(counts)
            for i in np.arange(len(counts)):
                current_thresholds, current_tables = \
                    _collect_contingency_tables(values, counts[i])
                if self.method == "log_odds":
                    func = (lambda x: odds_ratio(x, alpha=0.1))
                elif self.method == 'information_gain':
                    func = information_gain
                elif self.method == 'bns':
                    func = bns
                else:
                    raise ValueError("Wrong binarization method: {0}".format(self.method))
                scores = [func(table) for table in current_tables]
                best_score_index = np.argmax(scores)
                best_thresholds[i] = current_thresholds[best_score_index]
                best_scores[i] = scores[best_score_index]
        return best_thresholds, best_scores
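
A usage sketch for the unsupervised mode (the supervised modes additionally need y and the odds_ratio/bns helpers defined in the same module):

# binarizer = Binarizer(method='random')
# X_bin = binarizer.fit(X).transform(X)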
Example #43
	def by_threshold(self, threshold=0.0):
		bin = Skbin(threshold).fit(self.M)
		return bin.transform(self.M)
Example #44
print('Loading test data...')
with open('data/test-svmlight.dat') as infile:
	lines = infile.readlines()
	n_samples = len(lines)
	test = lil_matrix((n_samples, n_features))
	for n,line in enumerate(lines):
		for word_count in line.split():
			fid, count = word_count.split(':')
			test[n,int(fid)] = int(count)
test = test.tocsr()

if opts.binarize:
	print('Binarizing the data...')
	binar = Binarizer(copy=False)
	X = binar.transform(X)
	test = binar.transform(test)

if opts.tfidf:
	print('Transforming word occurrences into TF-IDF...')
	tranny = TfidfTransformer()
	X = tranny.fit_transform(X)
	test = tranny.transform(test)

if opts.select_features:
	k_features = int(opts.k_features)
	if opts.select_features == 'k-best':
		print('Selecting %i best features...' % k_features)
		ch2 = SelectKBest(chi2, k=k_features)
	if opts.select_features == 'pct':
		print('Selecting features in the top %i percentile...' % k_features)
Example #45
news_labels = extracted_data[' shares']      # Take shares column for labels

# Data Preprocessing
news_data_transpose = news_data.transpose()
data_into_dict = news_data_transpose.to_dict()
list_data = [v for k, v in data_into_dict.iteritems()]

# Encode
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer()
transformed_data = dv.fit_transform(list_data).toarray()

# Label Encoder - Binarization
from sklearn.preprocessing import Binarizer
binarizer = Binarizer(threshold=1400)                           # Threshold at 1400 because median of shares is 1400
transformed_labels = binarizer.transform(news_labels)
transformed_labels = transformed_labels.transpose().ravel()     # .ravel() is to fix "Too many array indices error"
                                                                # Could be a scikit or pandas bug
############## Classification #################

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC

# Decision Tree Classifier
tree = DecisionTreeClassifier()
knn = KNeighborsClassifier()
gnb = GaussianNB()
# lr = LinearRegression()
Example #46
		hub_ego=nx.ego_graph(Gsim,head_node,radius = 1) # step 1 only
	except NameError:
		head_node = each[0]
		hub_ego=nx.ego_graph(Gsim,head_node,radius = 1)

	index = hub_ego.nodes()
#	pec = random.uniform(0.5,0.8) # percentage of nodes selected between [0.5,0.8]
	pec = 0.8
	random.shuffle(index)
	subidx = index[:int(pec*len(index))]
	Y = np.zeros(num_sample)
	Y[::5] += 3 * (0.5-np.random.rand(num_sample/5)) # add noise to targets
	for each in subidx[1:]:
		Y += np.power(data_mat[:,each],3)
	binarizer = Binarizer()
	label = binarizer.transform(Y)

	# output the gene expression matrix
	ofp = open('nonlinear2.'+str(i)+'.genemat','w')
	for each in sorted(Gsim.nodes()):
		print >> ofp, str(each)+'\t'+'\t'.join(map(str,data_mat[:,each]))
	print >> ofp, 'outcome\t'+'\t'.join(map(str,label))
	ofp.close()
	#print 'significant network',index
	nx.write_adjlist(Gsim,"nonlinear2."+str(i)+".adjlist")
	os.system('epd_python svmnet.py -n nonlinear2.'+str(i)+'.adjlist -g nonlinear2.'+str(i)+'.genemat -o nonlinear2.svm.'+str(i)+'.txt -s 0')
#	os.system('epd_python ../rfnet.py -n nonlinear2.adjlist -g nonlinear2.genemat -o nonlinear2.rf.txt -s 0 -r 20')
	os.system('epd_python knnnet.py -n nonlinear2.'+str(i)+'.adjlist -g nonlinear2.'+str(i)+'.genemat -o nonlinear2.knn.'+str(i)+'.txt -s 0')
	svm_count += count_net('nonlinear2.svm.'+str(i)+'.txt',index)
	#rf_count += count_net('nonlinear2.rf.txt',index)
	knn_count += count_net('nonlinear2.knn.'+str(i)+'.txt',index)
Example #47
# In[3]:

# Import csv data
raw_data = pd.read_csv('OnlineNewsPopularity_wLabels_deleteNoise.csv').iloc[:, 1:]      # read in csv, omit the first column of url
raw_data = raw_data.iloc[:, :-1] 
news_data = raw_data.iloc[:, :-1]      # Take up to the second last column
news_labels = raw_data.iloc[:, -1]      # Take shares column for labels

# Binarize
print '\nBinary Threshold:'
binary_threshold = np.median(raw_data[' shares'])
news_data = news_data.drop(' n_non_stop_words', 1)
print binary_threshold
binarizer = Binarizer(threshold=binary_threshold)
y_binary = binarizer.transform(news_labels).transpose().ravel() 


# In[ ]:

# Discretize


# In[25]:

# Decision Tree
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()
print 'Decision Tree Classifier Accuracy Rate'
tree_score = cross_val_score(tree, news_data, y_binary, cv=10)
np.mean(tree_score)
Example #48
from sklearn.preprocessing import Binarizer, LabelEncoder, OneHotEncoder

onehot_encoder = OneHotEncoder()
label_encoder = LabelEncoder()

x = ['a', 'b', 'c']

label_x = label_encoder.fit_transform(x).reshape([len(x), 1])
print(label_x)
print(onehot_encoder.fit_transform(label_x).toarray())

binarizer = Binarizer(threshold=1.0).fit(label_x)
print(binarizer.transform(label_x))
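
For x = ['a', 'b', 'c'], label_x is [[0], [1], [2]]; with threshold=1.0 only the encoding of 'c' exceeds the threshold, so the final print shows [[0], [0], [1]].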
Example #49
	X_tokens = tokenizer.transform(X_train)

	# Train Recurrent Neural Network
	model = train_RNN(tokenizer, X_tokens, y_train)

	y_pred_tr = model.predict(X_tokens).flatten()

	# Check overall performance
	test_tokens = tokenizer.transform(X_test)
	y_pred_tst = model.predict(test_tokens).flatten()

	# Convert predictions to binary
	yhat_train = y_pred_tr.reshape(-1, 1)
	yhat_test  = y_pred_tst.reshape(-1, 1)
	binarizer = Binarizer(threshold=0.5).fit(yhat_train)
	yhat_tr_b = binarizer.transform(yhat_train).astype(int)
	yhat_tst_b = binarizer.transform(yhat_test).astype(int)

    save(model, 'review_score_full.pkl')

    with open('review_tokenizer_full.pkl', 'wb') as fileObject:
        pickle.dump(tokenizer, fileObject)

    # # Save model for future use
    # save(model, 'review_scorer1.pkl')
    # # model = load('review_scorer.pkl')
    # with open('review_tokenizer1.pkl','wb') as fileObject:
    #     pickle.dump(tokenizer, fileObject)

	# Scorers to consider
    # score()
Example #50
           }

#%%
    
os.chdir(workspace)

dev_idvs_all = numpy.nan_to_num(numpy.load(dev_filename + ".npy"))
val_idvs_all = numpy.nan_to_num(numpy.load(val_filename + ".npy"))

dev_dvs = numpy.nan_to_num(numpy.load(dev_filename + "_dvs.npy"))
val_dvs = numpy.nan_to_num(numpy.load(val_filename + "_dvs.npy"))

binarizer = Binarizer(copy=True, threshold=thresh)
imputer = Imputer(copy = False)

dev_dvs_binary = binarizer.transform(dev_dvs).reshape((dev_dvs.shape[0],))
val_dvs_binary = binarizer.transform(val_dvs).reshape((val_dvs.shape[0],))

"""
from statsmodels.regression import quantile_regression

dev_idvs2 = dev_idvs[:10000,:]
inds = [i for i in xrange(dev_idvs2.shape[1]) if len(unique(dev_idvs2[:,i])) > 1]
dev_dvs2 = dev_dvs[:10000,:].reshape((10000,))

model = quantile_regression.QuantReg(dev_dvs2, dev_idvs2)
model.fit()
"""

"""
#plot(mae_dev)