def reblog_prediction(client, id_num):
    """Predict reblog/not-reblog labels for dashboard posts newer than a post id.

    Parameters
    ----------
    client : API client exposing ``dashboard(reblog_info=..., since_id=...)``
        (presumably a pytumblr-style client -- confirm against caller).
    id_num : int or str
        Post id to start from on the dashboard; coerced with ``int()``.

    Side effects: loads ``my_model.h5`` and ``tokenizer.pickle`` from the
    working directory and prints one prediction line per post.
    """
    since_id = int(id_num)  # renamed: original shadowed the builtin `id`
    posts = client.dashboard(reblog_info=True, since_id=since_id)[u'posts']

    # Load the saved Keras model and the fitted tokenizer.
    model = load_model('my_model.h5')
    # NOTE: the original created a throwaway Tokenizer() that was immediately
    # overwritten by the unpickled one; the dead instance is removed here.
    with open('tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)

    # Index 0/1 of the model output maps onto these labels.
    labels = np.array(["reblog", "not on blog"])

    # Build one "blog_name summary" string per post (original shadowed the
    # builtin `str` inside a manual index loop).
    x_data = [
        clean_string(post['blog_name'] + " " + post[u"summary"])
        for post in posts
    ]
    x_tokenized = tokenizer.texts_to_matrix(pd.Series(x_data), mode='tfidf')

    # zip replaces the original hand-maintained counter `i`.
    for summary, x_t in zip(x_data, x_tokenized):
        prediction = model.predict(np.array([x_t]))
        predicted_label = labels[np.argmax(prediction[0])]
        print("Summary: " + summary + "\nPredicted label: " + predicted_label)
def _fit_data(self, X):
    """Binarize the data for each column separately.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]

    Returns
    -------
    X_transformed : array-like
        Returns the data where in each columns the labels are binarized.
    """
    if self.binarize is not None:
        X = binarize(X, threshold=self.binarize)
    for i in range(X.shape[1]):
        # initialise binarizer and save
        binarizer = LabelBinarizer()
        # Fixed: use the same `is not None` guard as the thresholding above.
        # The old truthiness test (`if self.binarize:`) skipped presetting
        # classes when binarize == 0, even though the data *was* binarized.
        # NOTE(review): sklearn's LabelBinarizer.fit recomputes classes_, so
        # this preset only matters if a custom binarizer is in scope -- confirm.
        if self.binarize is not None:
            binarizer.classes_ = np.array([0, 1])
        # fit the data to the binarizer
        binarizer.fit(X[:, i])
        self._binarizers.append(binarizer)
    return self._transform_data(X)
def binarizer_from_classifier(clf):
    """Build a LabelBinarizer whose class positions mirror ``clf.classes_``.

    Parameters
    ----------
    clf : fitted classifier exposing a ``classes_`` attribute.

    Returns
    -------
    LabelBinarizer
        A binarizer preconfigured (not fitted) with the classifier's classes.
    """
    binarizer = LabelBinarizer()
    binarizer.multilabel_ = False
    binarizer.classes_ = clf.classes_
    return binarizer
def deserialize_label_binarizer(label_binarizer_dict):
    """Rebuild a fitted LabelBinarizer from its serialized dict form.

    Parameters
    ----------
    label_binarizer_dict : dict
        Keys: 'neg_label', 'pos_label', 'sparse_output', 'y_type_',
        'sparse_input_', 'classes_'.

    Returns
    -------
    LabelBinarizer with the stored attributes restored.
    """
    lb = LabelBinarizer()
    # Plain attributes copy over directly; classes_ is rebuilt as an ndarray.
    for attr in ('neg_label', 'pos_label', 'sparse_output',
                 'y_type_', 'sparse_input_'):
        setattr(lb, attr, label_binarizer_dict[attr])
    lb.classes_ = np.array(label_binarizer_dict['classes_'])
    return lb
def test_sklearn_preinit(self):
    """A LabelBinarizer with preassigned classes_ transforms without fit()."""
    match_probs = np.array([1.0, 0.81, 0.85, 0.81, 0.85, 0.81])
    nonmatch_probs = np.array([1.0, 0.23, 0.50, 0.23, 0.30, 0.13])

    # Create the train dataset.
    X_train, true_links = binary_vectors(1000,
                                         500,
                                         m=match_probs,
                                         u=nonmatch_probs,
                                         random_state=535,
                                         return_links=True)

    binarizer = LabelBinarizer()
    binarizer.classes_ = np.array([0, 1])
    binarizer.transform(X_train.iloc[:, 1])

    assert len(binarizer.classes_) == 2
def fit(self, X_):
    """Fit one RobustLabelEncoder per categorical (object-dtype) column.

    Columns listed in ``self.drop`` are excluded. When ``self.onehot`` is
    set, an untrained LabelBinarizer with integer classes matching the
    encoder's class count is stored per column as well.

    Returns self for chaining.
    """
    cat_cols = [name for name in X_.columns if X_[name].dtype == 'object']
    for dropped in self.drop:
        cat_cols.remove(dropped)
    self.catCols = cat_cols

    for col in cat_cols:
        encoder = RobustLabelEncoder()
        # fit_transform is called for its fitting side effect; the
        # transformed values themselves are discarded.
        encoder.fit_transform(X_[col].tolist())
        self.encoders[col] = encoder
        if self.onehot:
            # Not trained: classes are assigned directly as the integer
            # range the label encoder maps onto.
            oh_encoder = LabelBinarizer()
            oh_encoder.classes_ = np.array(range(len(encoder.classes_)))
            self.oh_encoders[col] = oh_encoder
    return self
def load_encoders():
    """Loads the encoders built during `build_encoders`.

    # Returns
        encoders: A dict of encoder objects/specs.
    """
    module = "webcompat_ml.models.invalid.encoders"

    def _load_json(filename):
        # Resolve the packaged resource and parse it as JSON.
        with importlib_path(module, filename) as path:
            with open(path, "r", encoding="utf8", errors="ignore") as infile:
                return json.load(infile)

    encoders = {}

    # Text
    tokenizer = CountVectorizer(max_features=10000)
    tokenizer.vocabulary_ = _load_json("model_vocab.json")
    encoders["tokenizer"] = tokenizer

    # labels
    labels_encoder = LabelBinarizer()
    labels_encoder.classes_ = _load_json("labels_encoder.json")
    encoders["labels_encoder"] = labels_encoder

    # Target Field: invalid
    invalid_encoder = LabelEncoder()
    invalid_encoder.classes_ = np.array(_load_json("invalid_encoder.json"))
    encoders["invalid_encoder"] = invalid_encoder

    return encoders
def test_serialize_model():
    """Round-trip a hand-populated MLPClassifier through serialize/deserialize
    and verify the restored model matches the original."""
    instance = HostFootprint()
    model = MLPClassifier()

    # Build a minimal "fitted" label binarizer by hand.
    label_binarizer = LabelBinarizer()
    for attr, value in {'neg_label': 0,
                        'pos_label': 1,
                        'sparse_output': False,
                        'y_type_': "binary",
                        'sparse_input_': False}.items():
        setattr(label_binarizer, attr, value)
    label_binarizer.classes_ = np.array([0])

    parameters = {'hidden_layer_sizes': [(64, 32)]}
    GridSearchCV(model, parameters, cv=5, n_jobs=-1, scoring='f1_weighted')

    # Fake the fitted state the serializer expects.
    for attr, value in {'coefs_': np.array([[1], [2]]),
                        'loss_': 42,
                        'intercepts_': np.array([[3], [4]]),
                        'classes_': np.array([[5], [6]]),
                        'n_iter_': 42,
                        'n_layers_': 2,
                        'n_outputs_': 1,
                        'out_activation_': "logistic"}.items():
        setattr(model, attr, value)
    model._label_binarizer = label_binarizer
    model.features = ['test_1', 'test_2', 'test_3']

    with tempfile.TemporaryDirectory() as tmpdir:
        model_file = os.path.join(tmpdir, 'host_footprint.json')
        instance.serialize_model(model, model_file)
        new_model = instance.deserialize_model(model_file)

        assert model.features == new_model.features
        print(f"model params: {model.get_params()}")
        print(f"new_model params: {new_model.get_params()}")
        assert len(model.get_params()['hidden_layer_sizes']) == len(
            new_model.get_params()['hidden_layer_sizes'])
        assert model._label_binarizer.y_type_ == new_model._label_binarizer.y_type_
        assert len(model.coefs_) == len(new_model.coefs_)
        assert len(model.intercepts_) == len(new_model.intercepts_)
# Binarize the multi-label targets for train/test using the same fitted
# binarizer so column order is consistent across both splits.
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(train_data.raw_labels)
y_train = multilabel_binarizer.transform(train_data.raw_labels)
y_test = multilabel_binarizer.transform(test_data.raw_labels)

# Feature subsets: integer-named columns 3..29 plus "read_num".
# NOTE(review): assumes train_data/test_data are indexable by a column list
# (pandas-like) -- confirm against the loading code earlier in the file.
train_rfe = train_data[list(range(3, 30)) + ["read_num"]]
test_rfe = test_data[list(range(3, 30)) + ["read_num"]]
n_calsses = len(multilabel_binarizer.classes_)  # NOTE(review): typo for n_classes; kept because later cells may reference it

# Map each individual_id to a dense integer index.
ind_ids = {
    ind_id: i
    for i, ind_id in enumerate(df_full.individual_id.unique())
}
# For efficiency we use our own pre-seeded binarizer: classes_ is assigned
# directly from the id map instead of calling fit().
ind_id_binarizer = LabelBinarizer()
ind_id_binarizer.classes_ = np.array(list(ind_ids.keys()))

# In[214]:

timer.report_milestone("Preprocessing")

# ------------Model-----------
# Dropout level
DPLVL = 0.02
DENSE_RELU_LAYERS = 6
# Dense layer size
DNSS = 256
# Amount of groups [msid, individual_id (i.e. patient)] per batch
BATCH_G_S = 300
def load_encoders():
    """Loads the encoders built during `build_encoders`.

    # Returns
        encoders: A dict of encoder objects/specs.
    """
    def _read_json(filename):
        # All encoder specs live as JSON files under the 'encoders' dir.
        with open(os.path.join('encoders', filename), 'r',
                  encoding='utf8', errors='ignore') as infile:
            return json.load(infile)

    def _bin_label_binarizer(n_bins=4):
        # Binned numeric field: classes are the bin indices; never fitted.
        enc = LabelBinarizer()
        enc.classes_ = list(range(n_bins))
        return enc

    def _categorical_binarizer(filename):
        # Categorical field: classes restored straight from JSON; never fitted.
        enc = LabelBinarizer()
        enc.classes_ = _read_json(filename)
        return enc

    encoders = {}

    # make
    encoders['make_bins'] = _read_json('make_bins.json')
    encoders['make_encoder'] = _bin_label_binarizer()

    # body
    encoders['body_encoder'] = _categorical_binarizer('body_encoder.json')

    # mileage
    encoders['mileage_bins'] = _read_json('mileage_bins.json')
    encoders['mileage_encoder'] = _bin_label_binarizer()

    # engV
    encoders['engv_bins'] = _read_json('engv_bins.json')
    encoders['engv_encoder'] = _bin_label_binarizer()

    # engType
    encoders['engtype_encoder'] = _categorical_binarizer('engtype_encoder.json')

    # registration
    encoders['registration_encoder'] = _categorical_binarizer(
        'registration_encoder.json')

    # year
    encoders['year_bins'] = _read_json('year_bins.json')
    encoders['year_encoder'] = _bin_label_binarizer()

    # drive
    encoders['drive_encoder'] = _categorical_binarizer('drive_encoder.json')

    # Target Field: price

    return encoders
def load_encoders():
    """Loads the encoders built during `build_encoders`.

    # Returns
        encoders: A dict of encoder objects/specs.
    """
    def _read_json(filename):
        # All encoder specs live as JSON files under the 'encoders' dir.
        with open(os.path.join('encoders', filename), 'r',
                  encoding='utf8', errors='ignore') as infile:
            return json.load(infile)

    def _categorical_binarizer(filename):
        # Categorical field: classes restored straight from JSON; never fitted.
        enc = LabelBinarizer()
        enc.classes_ = _read_json(filename)
        return enc

    def _bin_label_binarizer(n_bins=10):
        # Binned numeric field: classes are the bin indices; never fitted.
        enc = LabelBinarizer()
        enc.classes_ = list(range(n_bins))
        return enc

    encoders = {}

    # Pclass
    encoders['pclass_encoder'] = _categorical_binarizer('pclass_encoder.json')

    # Sex
    encoders['sex_encoder'] = _categorical_binarizer('sex_encoder.json')

    # Age
    encoders['age_bins'] = _read_json('age_bins.json')
    encoders['age_encoder'] = _bin_label_binarizer()

    # Siblings/Spouses Aboard
    encoders['siblings_spouses_aboard_encoder'] = _categorical_binarizer(
        'siblings_spouses_aboard_encoder.json')

    # Parents/Children Aboard
    encoders['parents_children_aboard_encoder'] = _categorical_binarizer(
        'parents_children_aboard_encoder.json')

    # Fare
    encoders['fare_bins'] = _read_json('fare_bins.json')
    encoders['fare_encoder'] = _bin_label_binarizer()

    # Target Field: Survived
    survived_encoder = LabelEncoder()
    survived_encoder.classes_ = np.array(_read_json('survived_encoder.json'))
    encoders['survived_encoder'] = survived_encoder

    return encoders
def trawl(self, data):
    """Turn the raw string matrix `data` into a numeric feature matrix.

    Per-field handling is dispatched on self.feature_fields_map[field]:
      - 'float64': standardize the column (empty strings become 0).
      - 'categorical': one-hot encode with a LabelBinarizer.
      - 'indicator': two columns -- the scaled value and a missing-flag.
    The last column of `data` is treated as the label and appended last.
    Side effects: fills self.features_to_vector_idx, self.categoricals and
    self.vector_headers. Returns the assembled float64 matrix.

    NOTE(review): assumes `data` is a 2-D object/str ndarray whose rows align
    with self.features plus a trailing label column -- confirm with callers.
    """
    def getColumnsNum(chunks):
        # Current width of the matrix built so far, used as the starting
        # vector index for the next feature.
        # NOTE(review): the `- 1` looks suspicious here since the label
        # column is only appended *after* the loop -- confirm intent.
        if chunks is None:
            return 0
        else:
            return chunks.shape[1] - 1

    chunks = None
    # I'll be making submatrices here that I'll stitch together at the very end.
    # Python doesn't have type matching as good as scala's... so here I am
    # matching on a manually built map.
    for field in self.features:
        # If the field is a float field, then standardize the whole row.
        if self.feature_fields_map[field] == 'float64':
            column = [
                x if len(x) > 0 else 0
                for x in data[:, self.features.index(field)]
            ]  # Cleaning out empty vals
            chunk = scale(np.expand_dims(column, 1).astype('float64'))
            self.features_to_vector_idx[field] = getColumnsNum(
                chunks)  # To let us know where in the vec a feature is
            if chunks is None:
                chunks = chunk
            else:
                chunks = np.concatenate((chunks, chunk), 1)
        # If the field is categorical, then replace the column with onehot encodings.
        elif self.feature_fields_map[field] == 'categorical':
            lb = LabelBinarizer()
            lb.fit(data[:, self.features.index(field)])
            # Sort classes so the one-hot column order is deterministic.
            lb.classes_ = sorted(lb.classes_)
            chunk = lb.transform(data[:, self.features.index(field)])
            self.features_to_vector_idx[field] = getColumnsNum(
                chunks)  # To let us know where in the vec a feature is
            self.categoricals[
                field] = lb.classes_  # To let us know what categories were encoded
            if chunks is None:
                chunks = chunk
            else:
                chunks = np.concatenate((chunks, chunk), 1)
        elif self.feature_fields_map[field] == 'indicator':
            # Pair each value with a missing-flag: [value, 0] when present,
            # [0, 1] when the cell is an empty string.
            double_column = [[x, 0] if len(x) > 0 else [0, 1]
                             for x in data[:, self.features.index(field)]]
            double_column = np.array(double_column)
            indicators = double_column[:, 1]
            vals = scale(double_column[:, 0])
            if chunks is None:
                chunks = np.concatenate(
                    (np.expand_dims(vals, 1), np.expand_dims(
                        indicators, 1)), 1)
            else:
                chunks = np.concatenate((chunks, np.expand_dims(
                    vals, 1), np.expand_dims(indicators, 1)), 1)
    # Save the labelfield for last.
    labels = np.expand_dims(data[:, -1], 1)
    chunks = np.concatenate((chunks, labels), 1)
    # Putting together the full list of the new features we just transformed
    # These can later be aligned with weight vectors from models and whatnot
    # which is generally useful for model explainability
    headers = []
    for field in self.features:
        if self.feature_fields_map[field] == 'float64':
            headers.append(field)
        elif self.feature_fields_map[field] == 'categorical':
            categories = list(self.categoricals[field])
            if len(categories) > 2:
                # One header per one-hot column.
                headers += [
                    field + ": " + category
                    for category in list(self.categoricals[field])
                ]
            elif len(categories) == 2:
                # Binary category collapses to a single "a/b" header.
                headers += [categories[0] + "/" + categories[1]]
            else:
                headers += [categories[0]]
        elif self.feature_fields_map[field] == 'indicator':
            headers.append(field)
            headers.append(field + "_indicator")
    self.vector_headers = headers
    return chunks.astype('float64')
def load_encoders():
    """Loads the encoders built during `build_encoders`.

    # Returns
        encoders: A dict of encoder objects/specs.
    """
    def _read_json(filename):
        # All encoder specs live as JSON files under the 'encoders' dir.
        with open(os.path.join('encoders', filename), 'r',
                  encoding='utf8', errors='ignore') as infile:
            return json.load(infile)

    def _bin_label_binarizer(n_bins=10):
        # Binned numeric field: classes are the bin indices; never fitted.
        enc = LabelBinarizer()
        enc.classes_ = list(range(n_bins))
        return enc

    def _categorical_binarizer(filename):
        # Categorical field: classes restored straight from JSON; never fitted.
        enc = LabelBinarizer()
        enc.classes_ = _read_json(filename)
        return enc

    encoders = {}

    # Unnamed: 0
    encoders['unnamed_0_bins'] = _read_json('unnamed_0_bins.json')
    encoders['unnamed_0_encoder'] = _bin_label_binarizer()

    # Month
    encoders['month_encoder'] = _categorical_binarizer('month_encoder.json')

    # Year
    encoders['year_encoder'] = _categorical_binarizer('year_encoder.json')

    # sher
    encoders['sher_bins'] = _read_json('sher_bins.json')
    encoders['sher_encoder'] = _bin_label_binarizer()

    # Target Field: Sales
    sales_encoder = LabelBinarizer()
    sales_encoder.classes_ = np.array(_read_json('sales_encoder.json'))
    encoders['sales_encoder'] = sales_encoder

    return encoders
def load_encoders():
    """Loads the encoders built during `build_encoders`.

    # Returns
        encoders: A dict of encoder objects/specs.

    Cleanup: the original defined `*_encoder_attrs` lists (e.g.
    `make_encoder_attrs = ['mean_', 'var_', 'scale_']`) that were never
    used -- the attributes actually restored are whatever keys the JSON
    holds. The dead locals are removed and the repeated load logic is
    factored into helpers.
    """
    def _read_json(filename):
        # All encoder specs live as JSON files under the 'encoders' dir.
        with open(os.path.join('encoders', filename), 'r',
                  encoding='utf8', errors='ignore') as infile:
            return json.load(infile)

    def _scaler(filename):
        # Numeric field: StandardScaler restored attribute-by-attribute
        # (mean_, var_, scale_, ... -- whatever the JSON contains).
        scaler = StandardScaler()
        for attr, value in _read_json(filename).items():
            setattr(scaler, attr, value)
        return scaler

    def _categorical_binarizer(filename):
        # Categorical field: classes restored straight from JSON; never fitted.
        enc = LabelBinarizer()
        enc.classes_ = _read_json(filename)
        return enc

    encoders = {}

    # make
    encoders['make_encoder'] = _scaler('make_encoder.json')

    # body
    encoders['body_encoder'] = _categorical_binarizer('body_encoder.json')

    # mileage
    encoders['mileage_encoder'] = _scaler('mileage_encoder.json')

    # engV
    encoders['engv_encoder'] = _scaler('engv_encoder.json')

    # registration
    encoders['registration_encoder'] = _categorical_binarizer(
        'registration_encoder.json')

    # year
    encoders['year_encoder'] = _scaler('year_encoder.json')

    # drive
    encoders['drive_encoder'] = _categorical_binarizer('drive_encoder.json')

    # Target Field: price

    return encoders
def col2onehot_str(col, classes=None):
    """One-hot encode a column of labels with a LabelBinarizer.

    Parameters
    ----------
    col : sequence of labels to encode.
    classes : optional sequence/array of classes. When given, the binarizer
        is preconfigured with them and ``transform`` is used; otherwise
        ``fit_transform`` infers classes from ``col``.

    Returns
    -------
    (onehotcodes, binarizer) : the encoded matrix and the binarizer used.

    Fix: the original tested ``if classes:`` / ``if not classes`` -- that
    raises ValueError ("truth value of an array ... is ambiguous") whenever
    callers pass a multi-element numpy array. Explicit ``is not None``
    checks handle arrays and keep list/None callers behaving as before.
    """
    lbb = LabelBinarizer()
    if classes is not None:
        lbb.classes_ = classes
        onehotcodes = lbb.transform(col)
    else:
        onehotcodes = lbb.fit_transform(col)
    return onehotcodes, lbb