Example #1
    def words_killer(self, train, data, method, words_num_names_kept=50):
        if method == 'nb':
            normalizer = Binarizer()
            normalized_data = pd.DataFrame(normalizer.fit_transform(data))
            normalized_data.index = data.index
            train_data = pd.concat([normalized_data, train['label_class']],
                                   axis=1,
                                   join='inner')
            clf = BernoulliNB()
            clf.fit(train_data.drop('label_class', axis=1),
                    train_data['label_class'])
            print(
                'words killer auc: ',
                cross_val_score(clf,
                                train_data.drop('label_class', axis=1),
                                train_data['label_class'],
                                scoring='roc_auc'))
            fe = pd.Series(clf.coef_[0])
            fe.index = data.columns
            fe = fe.abs().sort_values(ascending=False)[:words_num_names_kept]
            return data[fe.index]

        elif method == 'pca':
            clf = PCA(n_components=words_num_names_kept)
            train_data = pd.DataFrame(clf.fit_transform(data))
            train_data.index = data.index
            return train_data

        elif method == 'lg':
            normalizer = MinMaxScaler()
            normalized_data = pd.DataFrame(normalizer.fit_transform(data))
            normalized_data.index = data.index
            train_data = pd.concat([normalized_data, train['label_class']],
                                   axis=1,
                                   join='inner')
            clf = LogisticRegression(class_weight='balanced')
            clf.fit(train_data.drop('label_class', axis=1),
                    train_data['label_class'])
            print(
                'words killer auc: ',
                cross_val_score(clf,
                                train_data.drop('label_class', axis=1),
                                train_data['label_class'],
                                scoring='roc_auc'))
            fe = pd.Series(clf.coef_[0])
            fe.index = data.columns
            fe = fe.abs().sort_values(ascending=False)[:words_num_names_kept]
            return data[fe.index]
        else:
            return data
def convert_to_classification(X_train, X_test, y_train, y_test, threshold,
                              num_features):
    # Discretize y values, i.e. convert the target variable from continuous to categorical.
    # The median target value is typically used as the threshold.
    transformer = Binarizer(threshold=threshold)
    y_train = y_train.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)
    X_test = X_test.reshape(-1, num_features)
    X_train = X_train.reshape(-1, num_features)
    y_train_discretized = transformer.fit_transform(y_train)
    y_test_discretized = transformer.fit_transform(y_test)
    print(X_train.shape)
    print(X_test.shape)
    print(y_train_discretized.shape)
    print(y_test_discretized.shape)
    return X_train, X_test, y_train_discretized, y_test_discretized
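A minimal usage sketch (with made-up random data) of convert_to_classification, binarizing regression targets at their median so they can be treated as class labels:

import numpy as np

X_train = np.random.rand(80, 4)
X_test = np.random.rand(20, 4)
y_train = np.random.rand(80)
y_test = np.random.rand(20)

threshold = np.median(y_train)
X_train, X_test, y_train_cls, y_test_cls = convert_to_classification(
    X_train, X_test, y_train, y_test, threshold, num_features=4)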
def test_wine():
    """Sample test on the Wine UCI dataset.

    Please do note this test is _not_ conclusive,
    but the zero class is so well-separated
    that all the variations should do well on this
    specific class
    """
    wine = sklearn.datasets.load_wine()
    train_data = Normalizer().fit_transform(wine.data)
    nsample, nfeatures = train_data.shape
    bindata = np.zeros(train_data.shape)
    for i in range(nfeatures):
        binarizer = Binarizer(threshold=train_data[:, i].mean())
        bindata[:, i] = binarizer.fit_transform(
            train_data[:, i].reshape(-1, 1)).ravel()
    model = BernoulliBayesianSet(bindata, meanfactor=2,
                                 alphaepsilon=0.0001, betaepsilon=0.0001)
    some_zero_class_indices = [0, 3, 5]
    ranking = np.argsort(model.query(some_zero_class_indices))[::-1]
    top10 = ranking[:10]
    truepositives = (wine.target[top10] == 0).sum()
    precision = truepositives / 10
    # allows a single mistake
    assert precision >= 0.9
Example #4
    def test_logistic_regression_cv_serializer(self):

        logistic_regression = LogisticRegressionCV(fit_intercept=True)
        logistic_regression.mlinit(input_features='a',
                                   prediction_column='e_binary')

        extract_features = ['e']
        feature_extractor = FeatureExtractor(
            input_scalars=['e'],
            output_vector='extracted_e_output',
            output_vector_items=["{}_out".format(x) for x in extract_features])

        binarizer = Binarizer(threshold=0.0)
        binarizer.mlinit(prior_tf=feature_extractor,
                         output_features='e_binary')

        Xres = binarizer.fit_transform(self.df[['a']])

        logistic_regression.fit(self.df[['a']], Xres)

        logistic_regression.serialize_to_bundle(self.tmp_dir,
                                                logistic_regression.name)

        # Test model.json
        with open("{}/{}.node/model.json".format(
                self.tmp_dir, logistic_regression.name)) as json_data:
            model = json.load(json_data)

        self.assertEqual(model['op'], 'logistic_regression')
        self.assertTrue(model['attributes']['intercept']['double'] is not None)
Example #5
def one_hot_vectorize_scikitLearn(corpus):
    freq = CountVectorizer()
    corpus = freq.fit_transform(corpus)

    onehot = Binarizer()
    vector = onehot.fit_transform(corpus.toarray())
    return vector
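A minimal usage sketch of one_hot_vectorize_scikitLearn on a toy corpus; every non-zero term count is clipped to a 0/1 presence flag:

corpus = ["the cat sat", "the cat sat on the mat", "dogs bark"]
vector = one_hot_vectorize_scikitLearn(corpus)
print(vector.shape)  # (3, number_of_unique_terms)
print(vector.max())  # 1 -- counts are reduced to presence/absence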
Example #6
def discretization(df, label_name):
    df_Ent = {}
    baseEntropy = calcShannonEnt(df)
    print('Base entropy is: %f' % baseEntropy)

    predictor = [x for x in df.columns if x != label_name]

    for x in predictor:
        bestInfoGain = 0.0
        df_Ent[x] = {}

        for row in range(len(df) - 1):
            newEntropy = 0.0
            sort_df = df[[x, label_name]].sort_values(by=x, ascending=True)
            if sort_df.iloc[row, 0] == sort_df.iloc[row + 1, 0]:
                continue
            split_point = (sort_df.iloc[row, 0] + sort_df.iloc[row + 1, 0]) / 2
            bin_encoder = Binarizer(threshold=split_point)
            sort_df[x] = bin_encoder.fit_transform(
                sort_df[x].values.reshape(-1, 1))
            for value in [0, 1]:
                subdataset = sort_df[sort_df[x] == value]
                prob = len(subdataset) / float(len(sort_df))
                newEntropy += prob * calcShannonEnt(subdataset)
            infoGain = baseEntropy - newEntropy

            if infoGain > bestInfoGain:
                df_Ent[x]['best_point'] = split_point
                df_Ent[x]['Ent'] = infoGain
                bestInfoGain = infoGain

    print('Best split point for %s is %f, maximum information gain is %f.' %
          (x, df_Ent[x]['best_point'], df_Ent[x]['Ent']))

    return df_Ent
Example #7
def preprocess(data):
    """Function to preprocess data using steps that were used during model building"""
    
    df = pd.read_json(data)

    # Separate the dataframe into numeric and categorical variables
    # for ease of handling and processing
    numeric_vars = df.select_dtypes(exclude=('category', 'object'))
    cat_vars = df.select_dtypes(include=('category', 'object'))

    # Scale numeric variables; keep the result as a DataFrame so columns can be indexed by name
    scaler = StandardScaler()
    scaled_numeric_vars = pd.DataFrame(scaler.fit_transform(numeric_vars),
                                       columns=numeric_vars.columns,
                                       index=numeric_vars.index)

    binarize = Binarizer()
    scaled_numeric_vars['capital_gain'] = binarize.fit_transform(
        scaled_numeric_vars['capital_gain'].values.reshape(-1, 1))
    scaled_numeric_vars['capital_loss'] = binarize.fit_transform(
        scaled_numeric_vars['capital_loss'].values.reshape(-1, 1))

    #get dummy variables of categorical data
    dummy_vars = pd.get_dummies(data = cat_vars)

    ## merge dataframe
    new_df = pd.merge(left=scaled_numeric_vars, right=dummy_vars, left_index=True, right_index=True)

    # Select the features that were used to train the model
    # (column_data is assumed to be a fitted column selector created during model building)
    transformed_df = column_data.transform(new_df)
    
    return transformed_df
Example #8
def binarization(features, threshold=0.0, is_copy=True):
    """
    DONE
    Binarize numeric features: values above `threshold` map to 1, the rest to 0.
    """
    binarizer = Binarizer(threshold=threshold, copy=is_copy)
    transformed_data = binarizer.fit_transform(features)
    return transformed_data
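A minimal usage sketch of binarization() on a small array; entries above the threshold become 1, the rest become 0:

import numpy as np

features = np.array([[0.5, -1.2, 3.0],
                     [2.2,  0.0, -0.7]])
print(binarization(features, threshold=1.0))
# [[0. 0. 1.]
#  [1. 0. 0.]]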
Example #9
def getTrigramEncoding(text_array):
    freq = CountVectorizer(ngram_range=(3, 3), analyzer='char_wb')  # trigram
    corpus_trigrams = freq.fit_transform(text_array)

    onehot = Binarizer()
    corpus_trigrams_one_hot = onehot.fit_transform(corpus_trigrams.toarray())

    return freq, corpus_trigrams_one_hot
Example #10
def binarizeMatrix(dataMatrix, threshold):
    """
    Maps every entry of the input to 0 or 1: values greater than `threshold`
    become 1, values less than or equal to it become 0.
    """

    binarizer = Binarizer(threshold=threshold)

    dataMatrix = binarizer.fit_transform(dataMatrix)

    return dataMatrix
def numeric2binary_preprocessor(df, binary_cols, threshold=0.0):
    binarized_cols = {}
    for col in binary_cols:
        binarizer = Binarizer(threshold=threshold, copy=False)
        df[col + "_binary"] = pd.DataFrame(binarizer.fit_transform(df[[col]]),
                                           index=df.index)
        df.drop(col, axis=1, inplace=True)
        binarized_cols[col] = binarizer

    return binarized_cols
def getPresenceFeatures(data):
    vectorizer = CountVectorizer(
        analyzer='word',
        lowercase=False,
    )
    features2 = vectorizer.fit_transform(data)  #.toarray() #Unigram features

    bin = Binarizer()
    presenceFeatures = bin.fit_transform(features2)
    return presenceFeatures, vectorizer
Example #13
def preprocess(logit, label):
    logit = toone(logit)
    if logit[-1][Data.intentdict[1]['none']] > 0.5 and label[-1][
            Data.intentdict[1]['none']] > 0.5:
        logit[-1][Data.intentdict[1]['none']] = int(0)
        label[-1][Data.intentdict[1]['none']] = int(0)
    bin = Binarizer(threshold=0.2)
    logit = bin.fit_transform([logit[-1]])
    label = bin.fit_transform([label[-1]])
    return logit[-1], label[-1]
def getTrigramFeatures(data):
    vectorizer = CountVectorizer(
        analyzer='word',
        lowercase=False,
        ngram_range=(1, 3),
    )
    features = vectorizer.fit_transform(data)

    bin = Binarizer()
    tgFt = bin.fit_transform(features)
    return tgFt, vectorizer
Example #15
def BinOutcome(dataset): 
    combatPts = []
    for poke in dataset:
        combatPts.append(poke.ptOut)

    meanPtOut = np.mean(combatPts)
    combatPts = np.array(combatPts)
    combatPts = combatPts.reshape(1, -1)

    binarizerP = Binarizer(threshold=meanPtOut)
    return binarizerP.fit_transform(combatPts)
def preprocess(logits, labels):
    logits = toone(logits)
    bin = Binarizer(threshold=0.2)
    for i in range(len(logits)):
        if logits[i][Data.intentdict[1]['none']] > 0.5 and labels[i][
                Data.intentdict[1]['none']] > 0.5:
            logits[i][Data.intentdict[1]['none']] = int(0)
            labels[i][Data.intentdict[1]['none']] = int(0)
    logits = bin.fit_transform(logits)
    labels = bin.fit_transform(labels)
    return logits.flatten(), labels.flatten()
Example #17
    def __labelBinarizer(self, threshold):
        """---labelBinarizer-----------------------------------------
        Values greater than the threshold map to 1,
        while values less than or equal to the threshold map to 0.
        ---Parameters
        threshold : float
        ---Return
        None
        ------------------------------------------------------"""
        from sklearn.preprocessing import Binarizer
        binarizer = Binarizer(threshold=threshold)
        self.__yData = binarizer.fit_transform(self.__yData)
Example #18
def calculate_score(predict_output, ground_truth, talker=None):
    test_talker = open('Data/test/talker', 'r').readlines()
    ret_pred_outputs = list()
    ret_ground_truth = list()
    talker_cnt = -1
    for pred, label in zip(predict_output, ground_truth):
        talker_cnt += 1
        if len(test_talker) <= talker_cnt:
            talker_cnt = len(test_talker) - 1
        if test_talker[talker_cnt].strip('\n') != talker and talker != 'ALL':
            continue
        pred_act = pred[:5]  # the first 5 entries are the act
        pred_attribute = pred[5:]  # the rest are the attributes
        binary = Binarizer(threshold=0.5)
        act_logit = one_hot(np.argmax(pred_act), "act")
        attribute_logit = binary.fit_transform([pred_attribute])
        if np.sum(attribute_logit) == 0:
            attribute_logit = one_hot(np.argmax(pred_attribute), "attribute")
        label = binary.fit_transform([label])
        ret_pred_outputs = np.append(ret_pred_outputs, np.append(act_logit, attribute_logit))
        ret_ground_truth = np.append(ret_ground_truth, label)
    return ret_pred_outputs, ret_ground_truth
Example #19
def sklearn_one_hot_vectorize(corpus):
    # The Sklearn one hot vectorize method

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.preprocessing import Binarizer

    freq = CountVectorizer()
    vectors = freq.fit_transform(corpus)

    print(len(vectors.toarray()[0]))

    onehot = Binarizer()
    vectors = onehot.fit_transform(vectors.toarray())

    print(len(vectors[0]))
Example #20
def wine_quality_white():
    # white wine quality dataset

    filename = '../../data/raw/mldata/winequality-white.csv'

    # The data corresponds to the first 11 columns of the csv file
    data = np.loadtxt(filename, usecols=tuple(range(11)), delimiter=';', dtype=float)
    # Read the label
    # We need to binarise the label using a threshold of 4
    bn = Binarizer(threshold=4)
    label = bn.fit_transform(
        np.loadtxt(filename, usecols=(11,), delimiter=';', dtype=int).reshape(-1, 1))
    # We need to invert the label: 1 -> 0 and 0 -> 1
    label = np.ravel(np.abs(label - 1))
    
    np.savez('../../data/clean/uci-wine-quality-white.npz', data=data, label=label)
Example #21
def do_logreg():
    from sklearn.preprocessing import Binarizer, scale
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, classification_report
    from sklearn.cross_validation import train_test_split
    from sklearn.cross_validation import cross_val_score
    from sklearn.grid_search import GridSearchCV
    from scipy.stats import expon
    import pandas
    ### load data
    col_names = [
        'mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
        'acceleration', 'model_year', 'origin', 'car_name'
    ]
    df = pandas.read_csv('auto_mpg.csv')
    df.columns = col_names
    df = df.drop('car_name', 1)

    lr = LogisticRegression()
    nfolds = 5  # number of cross-validation folds (assumed; not defined in the original snippet)
    bn = Binarizer(threshold=df['mpg'].mean())
    print "Performing binarization of the mpg variable into above/below average classes"
    target = np.ravel(bn.fit_transform(df['mpg'].values.reshape(-1, 1)))
    data = df.drop('mpg', 1)
    data = scale(data)
    print "Splitting into training and test sets"
    data_train, data_test, target_train, target_test = train_test_split(
        data, target, test_size=0.5, random_state=0)

    grid = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    print 'Searching for optimal C in {} using {}-fold validation on test set '.format(
        grid, nfolds)
    tuned_parameters = [{'C': grid}]
    clf = GridSearchCV(lr, tuned_parameters, cv=nfolds, scoring='accuracy')
    clf.fit(data_train, target_train)
    for params, mean_score, _ in clf.grid_scores_:
        print "{}: Mean accuracy {}".format(params, mean_score)

    print """Cross-validating above/below average mpg prediction
        using {}-fold validation on the test dataset.
        Using the best estimator: {}
        """.format(nfolds, clf.best_estimator_)

    mean_cross = np.mean(
        cross_val_score(clf.best_estimator_, data_test, target_test,
                        cv=nfolds))

    print "Mean cross-validated accuracy after optimization is: {}".format(
        mean_cross)
Example #22
def intentpreprocess(logit, label):
    logit = transform_to_onehot(logit)
    # 1 x intent vector
    # Sometimes there is no attribute at all, so a 'none' option is added for the model
    # to decide whether to emit a valid attribute or not.
    # A 'none' choice must not be counted as a label:
    # if both are 'none' the prediction is fine,
    # but if the label is 'none' and the logit is not, it would be a false positive.
    if logit[-1][Data.intentdict[1]['none']] > 0.5 and label[-1][
            Data.intentdict[1]['none']] > 0.5:
        logit[-1][Data.intentdict[1]['none']] = int(0)
        label[-1][Data.intentdict[1]['none']] = int(0)
    else:
        label[-1][Data.intentdict[1]['none']] = 0
    bin = Binarizer(threshold=0.2)
    logit = bin.fit_transform(logit)
    label = bin.fit_transform(label)
    return logit[-1], label[-1]
Example #23
def main():
    datasets = gen_datasets()
    print "origin data:"
    print datasets

    #0均值,单位方差
    standard_scaler = StandardScaler()
    scaler_datasets = standard_scaler.fit_transform(datasets)
    print scaler_datasets
    print "-" * 80

    min_max_scaler = MinMaxScaler()
    scaler_datasets = min_max_scaler.fit_transform(datasets)
    print scaler_datasets
    print "-" * 80

    max_abs_scaler = MaxAbsScaler()
    scaler_datasets = max_abs_scaler.fit_transform(datasets)
    print scaler_datasets
    print "-" * 80

    normalize = Normalizer(norm="l1")
    normalize_datasets = normalize.fit_transform(datasets)
    print normalize_datasets
    print "-" * 80

    binarizer = Binarizer(threshold=1.1)
    binarizer_datasets = binarizer.fit_transform(datasets)
    print binarizer_datasets
    print "-" * 80

    one_hot_encoder = OneHotEncoder()
    one_hot_encoder_datasets = one_hot_encoder.fit_transform([[0, 1, 4],
                                                              [1, 2, 0],
                                                              [2, 3, 5]])
    print one_hot_encoder_datasets.toarray()
    print "-" * 80

    imputer = Imputer(missing_values=0, strategy="median")
    imputer_datasets = imputer.fit_transform(datasets)
    print imputer_datasets
    print imputer.statistics_
Example #25
def us_crime():
    # US crime dataset

    filename = '../../data/raw/mldata/communities.data'

    # The missing data will be considered as NaN
    # Only use 122 continuous features
    tmp_data = np.genfromtxt(filename, delimiter = ',')
    tmp_data = tmp_data[:, 5:]

    # replace missing value by the mean
    imp = Imputer(verbose = 1)
    tmp_data = imp.fit_transform(tmp_data)

    # extract the data to be saved
    data = tmp_data[:, :-1]
    bn = Binarizer(threshold=0.65)
    label = np.ravel(bn.fit_transform(tmp_data[:, -1].reshape(-1, 1)))

    np.savez('../../data/clean/uci-us-crime.npz', data=data, label=label)
Example #26
    def test_logistic_regression_cv_deserializer(self):

        logistic_regression = LogisticRegressionCV(fit_intercept=True)
        logistic_regression.mlinit(input_features='a',
                                   prediction_column='e_binary')

        extract_features = ['e']
        feature_extractor = FeatureExtractor(
            input_scalars=['e'],
            output_vector='extracted_e_output',
            output_vector_items=["{}_out".format(x) for x in extract_features])

        binarizer = Binarizer(threshold=0.0)
        binarizer.mlinit(prior_tf=feature_extractor,
                         output_features='e_binary')

        Xres = binarizer.fit_transform(self.df[['a']])

        logistic_regression.fit(self.df[['a']], Xres)

        logistic_regression.serialize_to_bundle(self.tmp_dir,
                                                logistic_regression.name)

        # Test model.json
        with open("{}/{}.node/model.json".format(
                self.tmp_dir, logistic_regression.name)) as json_data:
            model = json.load(json_data)

        # Now deserialize it back
        node_name = "{}.node".format(logistic_regression.name)
        logistic_regression_tf = LogisticRegressionCV()
        logistic_regression_tf = logistic_regression_tf.deserialize_from_bundle(
            self.tmp_dir, node_name)

        res_a = logistic_regression.predict(self.df[['a']])
        res_b = logistic_regression_tf.predict(self.df[['a']])

        self.assertEqual(res_a[0], res_b[0])
        self.assertEqual(res_a[1], res_b[1])
        self.assertEqual(res_a[2], res_b[2])
def main():
    raw_datasets, _ = Datasets.load_datasets()
    X, Y = gen_datasets(raw_datasets)

    vectorizer = CountVectorizer(decode_error="ignore")
    cv_datasets = vectorizer.fit_transform(X).toarray()

    clf = ExtraTreesClassifier()
    clf = clf.fit(cv_datasets, Y)
    print cv_datasets.shape

    print clf.feature_importances_

    model = SelectFromModel(clf, prefit=True)
    X_new = model.transform(cv_datasets)
    print X_new.shape

    binarizer = Binarizer(threshold=1.0)
    b_datasets = binarizer.fit_transform(cv_datasets)
    variance_threshold = VarianceThreshold(.8 * (1 - .8))
    v_datasets = variance_threshold.fit_transform(b_datasets)
    print v_datasets.shape
Example #28
    def test_logistic_regression_cv_serializer(self):

        logistic_regression = LogisticRegressionCV(fit_intercept=True)
        logistic_regression.mlinit(input_features=['a', 'b', 'c'],
                                   prediction_column=['e_binary'])

        binarizer = Binarizer(threshold=0.0)
        binarizer.mlinit(input_features=['e'], output_features=['e_binary'])

        Xres = binarizer.fit_transform(self.df[['a']])

        logistic_regression.fit(self.df[logistic_regression.input_features],
                                Xres)

        logistic_regression.serialize_to_bundle(self.tmp_dir,
                                                logistic_regression.name)

        # Test model.json
        with open("{}/{}.node/model.json".format(
                self.tmp_dir, logistic_regression.name)) as json_data:
            model = json.load(json_data)

        self.assertEqual(model['op'], 'logistic_regression')
        self.assertTrue(model['attributes']['intercept']['value'] is not None)
Example #29
    def test_logistic_regression_cv_deserializer(self):

        logistic_regression = LogisticRegressionCV(fit_intercept=True)
        logistic_regression.mlinit(input_features=['a', 'b', 'c'],
                                   prediction_column=['e_binary'])

        binarizer = Binarizer(threshold=0.0)
        binarizer.mlinit(input_features=['e'], output_features=['e_binary'])

        Xres = binarizer.fit_transform(self.df[['a']])

        logistic_regression.fit(self.df[logistic_regression.input_features],
                                Xres)

        logistic_regression.serialize_to_bundle(self.tmp_dir,
                                                logistic_regression.name)

        # Test model.json
        with open("{}/{}.node/model.json".format(
                self.tmp_dir, logistic_regression.name)) as json_data:
            model = json.load(json_data)

        # Now deserialize it back
        node_name = "{}.node".format(logistic_regression.name)
        logistic_regression_tf = LogisticRegressionCV()
        logistic_regression_tf = logistic_regression_tf.deserialize_from_bundle(
            self.tmp_dir, node_name)

        res_a = logistic_regression.predict(
            self.df[logistic_regression.input_features])
        res_b = logistic_regression_tf.predict(
            self.df[logistic_regression_tf.input_features])

        self.assertEqual(res_a[0], res_b[0])
        self.assertEqual(res_a[1], res_b[1])
        self.assertEqual(res_a[2], res_b[2])
Example #30
def sparsityMeasure(loadPath, prefix):
    X, y = static_load_csr(loadPath)
    X_pos = X[y == 1, :]
    X_neg = X[y == 0, :]

    mean_traffic_pos = np.sum(np.sum(X_pos, axis=1)) * 1.0 / X_pos.shape[0]
    mean_traffic_neg = np.sum(np.sum(X_neg, axis=1)) * 1.0 / X_neg.shape[0]

    binarizer = Binarizer()
    X_pos = binarizer.fit_transform(X_pos)
    X_neg = binarizer.fit_transform(X_neg)

    mean_domains_pos = np.sum(np.sum(X_pos, axis=1)) * 1.0 / X_pos.shape[0]
    mean_domains_neg = np.sum(np.sum(X_neg, axis=1)) * 1.0 / X_neg.shape[0]

    print 'mean_traffic_pos : ' + str(mean_traffic_pos)
    print 'mean_traffic_neg : ' + str(mean_traffic_neg)
    print 'mean_domains_pos : ' + str(mean_domains_pos)
    print 'mean_domains_neg : ' + str(mean_domains_neg)

    overall_traffic = (mean_traffic_pos * X_pos.shape[0] + mean_traffic_neg * X_neg.shape[0]) * 1.0 / X.shape[0]
    overall_domains = (mean_domains_pos * X_pos.shape[0] + mean_domains_neg * X_neg.shape[0]) * 1.0 / X.shape[0]
    print 'overall_traffic : ' + str(overall_traffic)
    print 'overall_domains : ' + str(overall_domains)
def data_process(data, process_type):  # feature preprocessing
    if process_type == "Binary":  # binarization
        processmodule = Binarizer(copy=True, threshold=0.0)
        # values greater than threshold map to 1, values less than or equal to it map to 0

    elif process_type == "MinMax":  # 归一化处理
        processmodule = MinMaxScaler(feature_range=(0, 1), copy=True)

    elif process_type == "Stand":  # 标准化处理
        processmodule = StandardScaler(copy=True,
                                       with_mean=True,
                                       with_std=True)

    elif process_type == "Normal":
        processmodule = Normalizer(copy=True, norm="l2")

    elif process_type == "MultiLabelBinar":  # 多标签2值话处理
        processmodule = MultiLabelBinarizer(sparse_output=True)

    else:
        raise ValueError("please select a correct process_type")

    result = processmodule.fit_transform(data)
    return result
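A minimal usage sketch of data_process() on a toy numeric matrix (the "MultiLabelBinar" branch expects lists of labels instead):

import numpy as np

toy = np.array([[1.5, -0.5],
                [0.0,  2.0]])
print(data_process(toy, "Binary"))  # entries greater than 0.0 become 1
print(data_process(toy, "MinMax"))  # each column rescaled to [0, 1]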
# Load libraries
from sklearn.preprocessing import Binarizer
import numpy as np

# Create feature
age = np.array([[6],
                [12],
                [20],
                [36],
                [65]])
# Create binarizer
binarizer = Binarizer(threshold=18)

# Transform feature
bn = binarizer.fit_transform(age)
print(bn)
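# Expected output -- ages strictly above the threshold of 18 map to 1, the rest to 0:
# [[0]
#  [0]
#  [1]
#  [1]
#  [1]]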
d_tokens = file[10]
d_values = file[11]

from sklearn.feature_extraction.text import CountVectorizer

vectorizerToken = CountVectorizer(input='filename')
vectors = vectorizerToken.fit_transform(d_tokens)

print(len(vectors.toarray()))

res_1 = vectors.toarray()

from sklearn.preprocessing import Binarizer

onehot = Binarizer()
corpus = onehot.fit_transform(vectors.toarray())

res_2 = corpus

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(input='filename')
vec_tf_idf = tfidf.fit_transform(d_tokens)

res_3 = vec_tf_idf.toarray()

feature_names = vectorizerToken.get_feature_names()
res_1_names = ["count_" + feature for feature in feature_names]
res_2_names = ["bin_" + feature for feature in feature_names]
res_3_names = ["tfidf_" + feature for feature in feature_names]
    # Transform -1 into 0 and take spin up as the standard configuration
    binarizer = Binarizer(threshold=0)
    keys = list(datah5.keys())
    # put here the temperature from keys that you want to use for the training
    #class_names = [keys[i] for i in [4, 6, 7, 8, 9, 10, 11, 12, 16]]
    class_names = [keys[i] for i in [4, 10, 16]]
    n_samples = datah5[keys[0]].shape[0]
    datah5_norm = {}
    data_bin = {}
    for key in keys:
        datah5_norm[key] = np.array([
            np.where(np.sum(slice) < 0, -slice, slice) for slice in datah5[key]
        ])
        data_bin[key] = np.array(
            [binarizer.fit_transform(slice) for slice in datah5_norm[key]])

    # class labels even if they are not really useful here
    class_labels = np.asarray(
        list(
            itertools.chain.from_iterable(
                itertools.repeat(x, n_samples)
                for x in range(0, len(class_names)))))
    one_hot_labels = np.zeros((len(class_labels), len(class_names)))
    one_hot_labels[np.arange(len(class_labels)), class_labels] = 1

    data = data_bin[class_names[0]]
    for temperature in class_names[1:]:
        data = np.concatenate([data, data_bin[temperature]])

    radii = [0, 1, 2, 3, 4, 5, 6, 8, 10, 12, 14, 16]
Example #35
#
#	Comment the section below out if you have already made the pickle files
#
#---------------------------------------------------------------------------------------

all_bigr = ngram(X_train, 'bigram') #starting with all features

print "Starting counting bigrams..."
X_train_bi_counted = count(X_train, all_bigr, 'bigram')
print "Done counting train set"
X_test_bi_counted = count(X_test, all_bigr, 'bigram')
print "Done counting test set"

print "Binarizing and dumping files"
bin = Binarizer()
X_train_bi_binary = bin.fit_transform(X_train_bi_counted)
X_test_bi_binary = bin.transform(X_test_bi_counted)
pickle.dump(X_train_bi_binary, open( "X_train_bi_binary.p", "wb" ) )
pickle.dump(X_test_bi_binary, open( "X_test_bi_binary.p", "wb" ) )
print "Done"


print "Starting tfidf vectors..."
X_train_bi_tfidf, X_test_bi_tfidf = tfidf(X_train_bi_counted, X_test_bi_counted)
pickle.dump(X_train_bi_tfidf, open( "X_train_bi_tfidf.p", "wb" ) )
pickle.dump(X_test_bi_tfidf, open( "X_test_bi_tfidf.p", "wb" ) )
print "Done"


print "Starting feature selection using CART random forests on binary files"
indices_important_feats_bi_bin = tree(X_train_bi_binary, y_train, all_bigr, 'Bigram_binary')
Example #36
def binarize(img, threshold):
    binarizer = Binarizer(threshold=threshold, copy=False)
    return binarizer.fit_transform(img)
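A minimal usage sketch of binarize() on a toy grayscale patch; pixel values above the threshold become 1:

import numpy as np

img = np.array([[0.1, 0.8],
                [0.5, 0.3]])
print(binarize(img, 0.4))
# [[0. 1.]
#  [1. 0.]]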
    def fp_vectorizer(self, processed_data):
        binarizer = Binarizer(threshold=5)
        vectorized_data = binarizer.fit_transform(processed_data)
        return vectorized_data
Example #38
def load(opt='custom', x_filename=None, y_filename=None, n_samples=0,
         samples_on='rows', **kwargs):
    """Load a specified dataset.

    This function can be used either to load one of the standard scikit-learn
    datasets or a different dataset saved as X.npy Y.npy in the working
    directory.

    Parameters
    -----------
    opt : {'iris', 'digits', 'diabetes', 'boston', 'circles', 'moons',
          'custom', 'GSEXXXXX'}, default: 'custom'
        Name of a predefined dataset to be loaded. 'iris', 'digits', 'diabetes'
        'boston', 'circles' and 'moons' refer to the correspondent
        `scikit-learn` datasets. 'custom' can be used to load a custom dataset
        whose name is specified in `x_filename` and `y_filename` (optional).

    x_filename : string, default : None
        The data matrix file name.

    y_filename : string, default : None
        The label vector file name.

    n_samples : int
        The number of samples to be loaded. This comes in handy when dealing with
        large datasets. When n_samples is less than the actual size of the
        dataset this function performs a random subsampling that is stratified
        w.r.t. the labels (if provided).

    samples_on : string
        Either in ['row', 'rows'] if the samples lie on the rows of the input
        data matrix, or in ['col', 'cols'] if they lie on the columns.

    data_sep : string
        The data separator. For instance comma, tab, blank space, etc.

    Returns
    -----------
    X : array of float, shape : n_samples x n_features
        The input data matrix.

    y : array of float, shape : n_samples
        The label vector; np.nan if missing.

    feature_names : array of integers (or strings), shape : n_features
        The feature names; a range of numbers if missing.

    index : list of integers (or strings)
        This is the samples identifier, if provided as the first column (or row)
        of the input file. Otherwise it is just an incremental range of size
        n_samples.
    """
    data = None
    try:
        if opt.lower() == 'iris':
            data = datasets.load_iris()
        elif opt.lower() == 'digits':
            data = datasets.load_digits()
        elif opt.lower() == 'diabetes':
            data = datasets.load_diabetes()
            b = Binarizer(threshold=np.mean(data.target))
            data.target = b.fit_transform(data.target.reshape(-1, 1)).ravel()
        elif opt.lower() == 'boston':
            data = datasets.load_boston()
            b = Binarizer(threshold=np.mean(data.target))
            data.target = b.fit_transform(data.target.reshape(-1, 1)).ravel()
        elif opt.lower() == 'gauss':
            means = np.array([[-1, 1, 1, 1], [0, -1, 0, 0], [1, 1, -1, -1]])
            sigmas = np.array([0.33, 0.33, 0.33])
            if n_samples <= 1:
                n_samples = 333
            xx, yy = generate_gauss(mu=means, std=sigmas, n_sample=n_samples)
            data = datasets.base.Bunch(data=xx, target=yy)
        elif opt.lower() == 'circles':
            if n_samples == 0:
                n_samples = 400
            xx, yy = datasets.make_circles(n_samples=n_samples, factor=.3,
                                           noise=.05)
            data = datasets.base.Bunch(data=xx, target=yy)
        elif opt.lower() == 'moons':
            if n_samples == 0:
                n_samples = 400
            xx, yy = datasets.make_moons(n_samples=n_samples, noise=.01)
            data = datasets.base.Bunch(data=xx, target=yy)
        elif opt.lower() == 'custom':
            data = load_custom(x_filename, y_filename, samples_on, **kwargs)
        elif opt.lower().startswith('gse'):
            raise Exception("Use ade_GEO2csv.py to convert GEO DataSets "
                            "into csv files.")
    except IOError as e:
        print("I/O error({0}): {1}".format(e.errno, e.strerror))

    X, y = data.data, data.target
    if n_samples > 0 and X.shape[0] > n_samples:
        if y is not None:
            try:  # Legacy for sklearn
                sss = StratifiedShuffleSplit(y, test_size=n_samples, n_iter=1)
                # idx = np.random.permutation(X.shape[0])[:n_samples]
            except TypeError:
                sss = StratifiedShuffleSplit(test_size=n_samples) \
                    .split(X, y)
            _, idx = list(sss)[0]
        else:
            idx = np.arange(X.shape[0])
            np.random.shuffle(idx)
            idx = idx[:n_samples]

        X, y = X[idx, :], y[idx]
    else:
        # The length of index must be consistent with the number of samples
        idx = np.arange(X.shape[0])

    feat_names = data.feature_names if hasattr(data, 'feature_names') \
        else np.arange(X.shape[1])
    index = np.array(data.index)[idx] if hasattr(data, 'index') \
        else np.arange(X.shape[0])

    return X, y, feat_names, index