Пример #1
0
def reblog_prediction(client, id_num):
    """Predict, for each dashboard post newer than ``id_num``, whether it
    would be reblogged, and print one prediction per post summary.

    Parameters:
        client: authenticated Tumblr API client exposing ``dashboard()``.
        id_num: post id (int-convertible) to start from on the dashboard.
    """
    # Renamed from ``id`` to avoid shadowing the builtin.
    since_id = int(id_num)  # enter id number to start from on dashboard
    posts = client.dashboard(reblog_info=True, since_id=since_id)[u'posts']

    # Load our saved model.
    model = load_model('my_model.h5')

    # Load the fitted tokenizer.  (The original also built a fresh
    # ``Tokenizer()`` first, which was immediately overwritten — removed.)
    with open('tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)

    labels = np.array(["reblog", "not on blog"])

    # One cleaned "blog_name summary" text per post.  (Renamed the local
    # from ``str`` — it shadowed the builtin.)
    x_data = [
        clean_string(post['blog_name'] + " " + post[u"summary"])
        for post in posts
    ]

    x_tokenized = tokenizer.texts_to_matrix(pd.Series(x_data), mode='tfidf')

    # Predict each row and print it alongside its source summary.
    for summary, row in zip(x_data, x_tokenized):
        prediction = model.predict(np.array([row]))
        predicted_label = labels[np.argmax(prediction[0])]
        print("Summary: " + summary + "\nPredicted label: " + predicted_label)
Пример #2
0
    def _fit_data(self, X):
        """Fit one LabelBinarizer per column, then binarize the data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        X_transformed : array-like
            The data with the labels of every column binarized.

        """
        # Optionally threshold the raw values first.
        if self.binarize is not None:
            X = binarize(X, threshold=self.binarize)

        n_features = X.shape[1]
        for col in range(n_features):
            # One binarizer per column, kept for later transforms.
            binarizer = LabelBinarizer()

            # After thresholding the only possible labels are 0/1,
            # so pre-seed the classes.
            if self.binarize:
                binarizer.classes_ = np.array([0, 1])

            binarizer.fit(X[:, col])
            self._binarizers.append(binarizer)

        return self._transform_data(X)
Пример #3
0
def binarizer_from_classifier(clf):
    """Return a LabelBinarizer whose class positions mirror ``clf.classes_``."""
    binarizer = LabelBinarizer()
    binarizer.multilabel_ = False
    binarizer.classes_ = clf.classes_
    return binarizer
Пример #4
0
def binarizer_from_classifier(clf):
    """Build a pre-fitted LabelBinarizer aligned with an input classifier,
    so each binarized column corresponds to the same position in
    ``clf.classes_``.
    """
    result = LabelBinarizer()
    result.classes_ = clf.classes_
    result.multilabel_ = False
    return result
def deserialize_label_binarizer(label_binarizer_dict):
    """Rebuild a fitted LabelBinarizer from its serialized attribute dict."""
    lb = LabelBinarizer()
    # Constructor-style parameters.
    for attr in ('neg_label', 'pos_label', 'sparse_output'):
        setattr(lb, attr, label_binarizer_dict[attr])
    # Fitted state.
    lb.y_type_ = label_binarizer_dict['y_type_']
    lb.sparse_input_ = label_binarizer_dict['sparse_input_']
    lb.classes_ = np.array(label_binarizer_dict['classes_'])
    return lb
Пример #6
0
    def test_sklearn_preinit(self):
        """A LabelBinarizer with manually pre-set classes_ still transforms."""
        m_probs = np.array([1.0, .81, .85, .81, .85, .81])
        u_probs = np.array([1.0, .23, .50, .23, .30, 0.13])

        # Create the train dataset.
        X_train, true_links = binary_vectors(1000,
                                             500,
                                             m=m_probs,
                                             u=u_probs,
                                             random_state=535,
                                             return_links=True)

        # Skip fit() entirely: seed the binary classes by hand.
        binarizer = LabelBinarizer()
        binarizer.classes_ = np.array([0, 1])

        binarizer.transform(X_train.iloc[:, 1])
        assert len(binarizer.classes_) == 2
Пример #7
0
    def fit(self, X_):
        """Fit a RobustLabelEncoder for every object-dtype column of X_.

        Columns listed in ``self.drop`` are excluded.  When ``self.onehot``
        is set, a pre-seeded LabelBinarizer is also stored per column.
        """
        # Candidate categorical columns: object dtype, minus explicit drops.
        cat_columns = [
            name for name in X_.columns if X_[name].dtype == 'object'
        ]
        for dropped in self.drop:
            cat_columns.remove(dropped)
        self.catCols = cat_columns

        for name in cat_columns:
            label_encoder = RobustLabelEncoder()
            label_encoder.fit_transform(X_[name].tolist())
            self.encoders[name] = label_encoder
            if self.onehot:
                # No training needed: pre-seed classes_ with the integer
                # codes produced by the label encoder.
                onehot = LabelBinarizer()
                onehot.classes_ = np.array(range(len(label_encoder.classes_)))
                self.oh_encoders[name] = onehot
        return self
Пример #8
0
def load_encoders():
    """Loads the encoders built during `build_encoders`.

    # Returns
        encoders: A dict of encoder objects/specs.
    """
    encoders = {}
    module = "webcompat_ml.models.invalid.encoders"

    # Text: restore the CountVectorizer vocabulary.
    tokenizer = CountVectorizer(max_features=10000)
    with importlib_path(module, "model_vocab.json") as path:
        with open(path, "r", encoding="utf8", errors="ignore") as infile:
            tokenizer.vocabulary_ = json.load(infile)
    encoders["tokenizer"] = tokenizer

    # Labels: restore the binarizer's class list.
    labels_encoder = LabelBinarizer()
    with importlib_path(module, "labels_encoder.json") as path:
        with open(path, "r", encoding="utf8", errors="ignore") as infile:
            labels_encoder.classes_ = json.load(infile)
    encoders["labels_encoder"] = labels_encoder

    # Target field "invalid": restore the label encoder's classes.
    invalid_encoder = LabelEncoder()
    with importlib_path(module, "invalid_encoder.json") as path:
        with open(path, "r", encoding="utf8", errors="ignore") as infile:
            invalid_encoder.classes_ = np.array(json.load(infile))
    encoders["invalid_encoder"] = invalid_encoder

    return encoders
def test_serialize_model():
    """serialize_model/deserialize_model round-trips an MLPClassifier."""
    instance = HostFootprint()
    model = MLPClassifier()

    # Hand-craft a fitted-looking binary label binarizer.
    binarizer = LabelBinarizer()
    binarizer.neg_label = 0
    binarizer.pos_label = 1
    binarizer.sparse_output = False
    binarizer.y_type_ = "binary"
    binarizer.sparse_input_ = False
    binarizer.classes_ = np.array([0])

    parameters = {'hidden_layer_sizes': [(64, 32)]}
    GridSearchCV(model, parameters, cv=5, n_jobs=-1, scoring='f1_weighted')

    # Fake the attributes a fitted model would carry.
    fitted_state = {
        'coefs_': np.array([[1], [2]]),
        'loss_': 42,
        'intercepts_': np.array([[3], [4]]),
        'classes_': np.array([[5], [6]]),
        'n_iter_': 42,
        'n_layers_': 2,
        'n_outputs_': 1,
        'out_activation_': "logistic",
        '_label_binarizer': binarizer,
        'features': ['test_1', 'test_2', 'test_3'],
    }
    for attr, value in fitted_state.items():
        setattr(model, attr, value)

    with tempfile.TemporaryDirectory() as tmpdir:
        model_file = os.path.join(tmpdir, 'host_footprint.json')
        instance.serialize_model(model, model_file)
        new_model = instance.deserialize_model(model_file)
        assert model.features == new_model.features
        print(f"model params: {model.get_params()}")
        print(f"new_model params: {new_model.get_params()}")
        assert len(model.get_params()['hidden_layer_sizes']) == len(
            new_model.get_params()['hidden_layer_sizes'])
        assert model._label_binarizer.y_type_ == new_model._label_binarizer.y_type_
        assert len(model.coefs_) == len(new_model.coefs_)
        assert len(model.intercepts_) == len(new_model.intercepts_)
# Binarize the multi-label targets using the label sets seen in training.
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(train_data.raw_labels)
y_train = multilabel_binarizer.transform(train_data.raw_labels)
y_test = multilabel_binarizer.transform(test_data.raw_labels)
# Feature subsets: columns 3..29 plus "read_num" — presumably inputs for
# recursive feature elimination given the "_rfe" names; TODO confirm.
train_rfe = train_data[list(range(3, 30)) + ["read_num"]]
test_rfe = test_data[list(range(3, 30)) + ["read_num"]]
# NOTE(review): "n_calsses" looks like a typo for "n_classes" — other code
# may reference the misspelled name, so confirm before renaming.
n_calsses = len(multilabel_binarizer.classes_)

# Map each individual_id to a dense index.
ind_ids = {
    ind_id: i
    for i, ind_id in enumerate(df_full.individual_id.unique())
}
ind_id_binarizer = LabelBinarizer()
# For efficiency we pre-seed classes_ ourselves instead of calling fit().
ind_id_binarizer.classes_ = np.array(list(ind_ids.keys()))
# In[214]: (notebook cell marker)
timer.report_milestone("Preprocessing")

#        ------------Model-----------

# Dropout level
DPLVL = 0.02

# Number of dense ReLU layers
DENSE_RELU_LAYERS = 6

# Dense layer size
DNSS = 256

# Amount of groups [msid, individual_id (i.e. patient)] per batch
BATCH_G_S = 300
Пример #11
0
def load_encoders():
    """Loads the encoders built during `build_encoders`.

    Every binned numeric field contributes two entries ("<field>_bins" with
    the saved bin edges and "<field>_encoder" with a 4-class binarizer);
    every categorical field contributes one "<field>_encoder" whose classes
    come from its saved JSON file.  Insertion order matches the original
    hand-written version.

    # Returns
        encoders: A dict of encoder objects/specs.
    """

    def _load_json(filename):
        # Every spec lives as a JSON file in the 'encoders' directory.
        with open(os.path.join('encoders', filename),
                  'r',
                  encoding='utf8',
                  errors='ignore') as infile:
            return json.load(infile)

    def _bin_encoder(n_bins):
        # Binarizer over pre-computed bin indices 0..n_bins-1; no fit needed.
        encoder = LabelBinarizer()
        encoder.classes_ = list(range(n_bins))
        return encoder

    def _class_encoder(filename):
        # Binarizer whose class list was saved by `build_encoders`.
        encoder = LabelBinarizer()
        encoder.classes_ = _load_json(filename)
        return encoder

    encoders = {}

    # make (binned numeric)
    encoders['make_bins'] = _load_json('make_bins.json')
    encoders['make_encoder'] = _bin_encoder(4)

    # body (categorical)
    encoders['body_encoder'] = _class_encoder('body_encoder.json')

    # mileage (binned numeric)
    encoders['mileage_bins'] = _load_json('mileage_bins.json')
    encoders['mileage_encoder'] = _bin_encoder(4)

    # engV (binned numeric)
    encoders['engv_bins'] = _load_json('engv_bins.json')
    encoders['engv_encoder'] = _bin_encoder(4)

    # engType (categorical)
    encoders['engtype_encoder'] = _class_encoder('engtype_encoder.json')

    # registration (categorical)
    encoders['registration_encoder'] = _class_encoder('registration_encoder.json')

    # year (binned numeric)
    encoders['year_bins'] = _load_json('year_bins.json')
    encoders['year_encoder'] = _bin_encoder(4)

    # drive (categorical)
    encoders['drive_encoder'] = _class_encoder('drive_encoder.json')

    # Target Field: price (no encoder required)

    return encoders
Пример #12
0
def load_encoders():
    """Loads the encoders built during `build_encoders`.

    # Returns
        encoders: A dict of encoder objects/specs.
    """

    def _read_json(filename):
        # All encoder specs are JSON files under the 'encoders' directory.
        with open(os.path.join('encoders', filename),
                  'r', encoding='utf8', errors='ignore') as infile:
            return json.load(infile)

    encoders = {}

    # Pclass
    pclass_encoder = LabelBinarizer()
    pclass_encoder.classes_ = _read_json('pclass_encoder.json')
    encoders['pclass_encoder'] = pclass_encoder

    # Sex
    sex_encoder = LabelBinarizer()
    sex_encoder.classes_ = _read_json('sex_encoder.json')
    encoders['sex_encoder'] = sex_encoder

    # Age — bucketed into 10 bins.
    age_encoder = LabelBinarizer()
    age_encoder.classes_ = list(range(10))
    encoders['age_bins'] = _read_json('age_bins.json')
    encoders['age_encoder'] = age_encoder

    # Siblings/Spouses Aboard
    siblings_spouses_aboard_encoder = LabelBinarizer()
    siblings_spouses_aboard_encoder.classes_ = _read_json(
        'siblings_spouses_aboard_encoder.json')
    encoders['siblings_spouses_aboard_encoder'] = siblings_spouses_aboard_encoder

    # Parents/Children Aboard
    parents_children_aboard_encoder = LabelBinarizer()
    parents_children_aboard_encoder.classes_ = _read_json(
        'parents_children_aboard_encoder.json')
    encoders['parents_children_aboard_encoder'] = parents_children_aboard_encoder

    # Fare — bucketed into 10 bins.
    fare_encoder = LabelBinarizer()
    fare_encoder.classes_ = list(range(10))
    encoders['fare_bins'] = _read_json('fare_bins.json')
    encoders['fare_encoder'] = fare_encoder

    # Target Field: Survived
    survived_encoder = LabelEncoder()
    survived_encoder.classes_ = np.array(_read_json('survived_encoder.json'))
    encoders['survived_encoder'] = survived_encoder

    return encoders
Пример #13
0
    def trawl(self, data):
        """Convert the raw feature matrix into a fully numeric float64 matrix.

        Each column of ``data`` is transformed according to its declared
        type in ``self.feature_fields_map``: 'float64' columns are
        standardized, 'categorical' columns are one-hot encoded, and
        'indicator' columns are expanded into a (scaled value,
        missing-indicator) pair.  The label column (last column of ``data``)
        is appended untouched.  Side effects: fills
        ``self.features_to_vector_idx``, ``self.categoricals`` and
        ``self.vector_headers``.

        Parameters:
            data: 2-D array; feature columns ordered as ``self.features``
                with the label in the final column.  Cells appear to be
                strings (``len(x)`` is used to detect empties) — TODO confirm.

        Returns:
            2-D float64 array of transformed features plus the label column.
        """
        def getColumnsNum(chunks):
            # Starting index of the next feature in the stitched matrix.
            # NOTE(review): for a non-empty matrix this returns shape[1]-1,
            # one LESS than the next free column — confirm the -1 is
            # intended and not an off-by-one.

            if chunks is None:
                return 0
            else:
                return chunks.shape[1] - 1

        chunks = None  # I'll be making submatrices here that I'll stitch together at the very end.

        # Python doesn't have type matching as good as scala's... so here I am matching on a manually built map.
        for field in self.features:

            # If the field is a float field, then standardize the whole row.
            if self.feature_fields_map[field] == 'float64':
                column = [
                    x if len(x) > 0 else 0
                    for x in data[:, self.features.index(field)]
                ]  # Cleaning out empty vals (empty string -> 0)
                chunk = scale(np.expand_dims(column, 1).astype('float64'))

                self.features_to_vector_idx[field] = getColumnsNum(
                    chunks)  # To let us know where in the vec a feature is

                if chunks is None:
                    chunks = chunk
                else:
                    chunks = np.concatenate((chunks, chunk), 1)

            # If the field is categorical, then replace the column with onehot encodings.
            elif self.feature_fields_map[field] == 'categorical':
                lb = LabelBinarizer()
                lb.fit(data[:, self.features.index(field)])
                # Sort so the one-hot column order is deterministic.
                lb.classes_ = sorted(lb.classes_)
                chunk = lb.transform(data[:, self.features.index(field)])

                self.features_to_vector_idx[field] = getColumnsNum(
                    chunks)  # To let us know where in the vec a feature is
                self.categoricals[
                    field] = lb.classes_  # To let us know what categories were encoded

                if chunks is None:
                    chunks = chunk
                else:
                    chunks = np.concatenate((chunks, chunk), 1)

            elif self.feature_fields_map[field] == 'indicator':
                # Expand into [value, missing-flag]: empty cells become
                # (0, 1); present cells become (x, 0) with x scaled below.
                # NOTE(review): unlike the other branches, this one never
                # records features_to_vector_idx for the field — confirm.

                double_column = [[x, 0] if len(x) > 0 else [0, 1]
                                 for x in data[:,
                                               self.features.index(field)]]
                double_column = np.array(double_column)
                indicators = double_column[:, 1]
                vals = scale(double_column[:, 0])

                if chunks is None:
                    chunks = np.concatenate(
                        (np.expand_dims(vals, 1), np.expand_dims(
                            indicators, 1)), 1)
                else:
                    chunks = np.concatenate((chunks, np.expand_dims(
                        vals, 1), np.expand_dims(indicators, 1)), 1)

        # Save the labelfield for last.
        labels = np.expand_dims(data[:, -1], 1)
        chunks = np.concatenate((chunks, labels), 1)

        # Putting together the full list of the new features we just transformed
        # These can later be aligned with weight vectors from models and whatnot
        # which is generally useful for model explainability
        headers = []
        for field in self.features:
            if self.feature_fields_map[field] == 'float64':
                headers.append(field)
            elif self.feature_fields_map[field] == 'categorical':
                categories = list(self.categoricals[field])
                # >2 classes: one header per class; exactly 2: LabelBinarizer
                # emits a single column, so one combined "a/b" header;
                # degenerate single-class fields keep just the class name.
                if len(categories) > 2:
                    headers += [
                        field + ": " + category
                        for category in list(self.categoricals[field])
                    ]
                elif len(categories) == 2:
                    headers += [categories[0] + "/" + categories[1]]
                else:
                    headers += [categories[0]]
            elif self.feature_fields_map[field] == 'indicator':
                headers.append(field)
                headers.append(field + "_indicator")

        self.vector_headers = headers

        return chunks.astype('float64')
Пример #14
0
def load_encoders():
    """Loads the encoders built during `build_encoders`.

    # Returns
        encoders: A dict of encoder objects/specs.
    """

    def _read_json(filename):
        # All encoder specs are JSON files under the 'encoders' directory.
        with open(os.path.join('encoders', filename),
                  'r',
                  encoding='utf8',
                  errors='ignore') as infile:
            return json.load(infile)

    encoders = {}

    # Unnamed: 0 — bucketed into 10 bins.
    unnamed_0_encoder = LabelBinarizer()
    unnamed_0_encoder.classes_ = list(range(10))
    encoders['unnamed_0_bins'] = _read_json('unnamed_0_bins.json')
    encoders['unnamed_0_encoder'] = unnamed_0_encoder

    # Month
    month_encoder = LabelBinarizer()
    month_encoder.classes_ = _read_json('month_encoder.json')
    encoders['month_encoder'] = month_encoder

    # Year
    year_encoder = LabelBinarizer()
    year_encoder.classes_ = _read_json('year_encoder.json')
    encoders['year_encoder'] = year_encoder

    # sher — bucketed into 10 bins.
    sher_encoder = LabelBinarizer()
    sher_encoder.classes_ = list(range(10))
    encoders['sher_bins'] = _read_json('sher_bins.json')
    encoders['sher_encoder'] = sher_encoder

    # Target Field: Sales
    sales_encoder = LabelBinarizer()
    sales_encoder.classes_ = np.array(_read_json('sales_encoder.json'))
    encoders['sales_encoder'] = sales_encoder

    return encoders
def load_encoders():
    """Loads the encoders built during `build_encoders`.

    Numeric fields get a StandardScaler restored from its saved attribute
    dict (mean_/var_/scale_ per the JSON dumps); categorical fields get a
    LabelBinarizer restored from its saved class list.  The original
    version defined unused ``*_encoder_attrs`` locals for every scaler —
    removed — and repeated the same load sequence seven times — factored
    into helpers.  Dict insertion order matches the original.

    # Returns
        encoders: A dict of encoder objects/specs.
    """

    def _load_json(filename):
        # Read one serialized-encoder JSON file from the 'encoders' dir.
        with open(os.path.join('encoders', filename),
                  'r',
                  encoding='utf8',
                  errors='ignore') as infile:
            return json.load(infile)

    def _scaler(filename):
        # Restore a StandardScaler from its saved attribute dict.
        scaler = StandardScaler()
        for attr, value in _load_json(filename).items():
            setattr(scaler, attr, value)
        return scaler

    def _binarizer(filename):
        # Restore a LabelBinarizer from its saved class list.
        binarizer = LabelBinarizer()
        binarizer.classes_ = _load_json(filename)
        return binarizer

    encoders = {
        'make_encoder': _scaler('make_encoder.json'),
        'body_encoder': _binarizer('body_encoder.json'),
        'mileage_encoder': _scaler('mileage_encoder.json'),
        'engv_encoder': _scaler('engv_encoder.json'),
        'registration_encoder': _binarizer('registration_encoder.json'),
        'year_encoder': _scaler('year_encoder.json'),
        'drive_encoder': _binarizer('drive_encoder.json'),
    }

    # Target Field: price (no encoder required)

    return encoders
Пример #16
0
def col2onehot_str(col, classes=None):
    """One-hot encode a column of labels.

    Parameters:
        col: iterable of labels to encode.
        classes: optional pre-defined class sequence/array; when given, the
            binarizer is not fitted and ``col`` is transformed against it.

    Returns:
        (onehotcodes, lbb): the one-hot matrix and the LabelBinarizer used.
    """
    lbb = LabelBinarizer()
    # ``is not None`` instead of truthiness: ``if classes:`` raises
    # ValueError for non-empty numpy arrays ("truth value of an array is
    # ambiguous") and this also removes the redundant second check of
    # ``classes`` in the original conditional expression.
    if classes is not None:
        # Pre-seed the classes so transform() uses the caller's ordering.
        lbb.classes_ = classes
        onehotcodes = lbb.transform(col)
    else:
        onehotcodes = lbb.fit_transform(col)
    return onehotcodes, lbb