X_train = numpy.zeros((train_size, (max_words + num_tags)), dtype='str')
X_test = numpy.zeros((test_size, (max_words + num_tags)), dtype='str')

# Integer encode documents
X_train[:, 0:max_words] = tweet_token.texts_to_matrix(X[0:train_size, 0], mode='count')
X_train[:, max_words:(max_words + num_tags)] = tag_token.texts_to_matrix(
    X[0:train_size, 1], mode='count')
X_test[:, 0:max_words] = tweet_token.texts_to_matrix(X[train_size:data_size, 0], mode='count')
X_test[:, max_words:(max_words + num_tags)] = tag_token.texts_to_matrix(
    X[train_size:data_size, 1], mode='count')

# Preprocess labels
encoder = preprocessing.LabelBinarizer()
Y = encoder.fit_transform(y)
Y = np_utils.to_categorical(Y, num_classes=2, dtype='float32')
Y_train = Y[0:train_size, :]
Y_test = Y[train_size:data_size, :]
# print(Y_train.shape)
# print(Y_test.shape)
# print(Y)

filename = ".\\Logs\\" + startTime + "_tweet_training_data.csv"

# Setup Callbacks
callbackList = [
    keras.callbacks.CSVLogger(filename, separator=',', append=True),
    keras.callbacks.ModelCheckpoint(".\\Models\\" + startTime +
with open("./fonts.dat", 'rb') as f:
    ind = pickle.load(f, encoding="bytes")

X = ind[b"data"]
X = X.reshape(-1, X.shape[1] * X.shape[2])
Y = ind[b"target"]
fonts_name = ind[b"letter"]

x_train, x_test, y_train_label, y_test_label = train_test_split(X, Y, test_size=0.2)
x_train = x_train.astype(np.float32)
x_test = x_test.astype(np.float32)
y_train = preprocessing.LabelBinarizer().fit_transform(y_train_label)
y_test = preprocessing.LabelBinarizer().fit_transform(y_test_label)

from common.multi_layer_net import MultiLayerNet
from common.optimizer import *

weight_decay_lambda = 0
network = MultiLayerNet(input_size=2304, hidden_size_list=[100, 100], output_size=48,
                        activation="sigmoid", weight_decay_lambda=weight_decay_lambda)
optimizer = Momentum(lr=0.0045)
max_epochs = 14
train_size = x_train.shape[0]
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        # target_names = column_mapping.get('target_names')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [
                name for name in num_feature_names
                if is_numeric_dtype(reference_data[name])
            ]

        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            cat_feature_names = [
                name for name in cat_feature_names
                if is_numeric_dtype(reference_data[name])
            ]
    else:
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

        utility_columns = [date_column, id_column, target_column, prediction_column]

        num_feature_names = list(
            set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns))
        cat_feature_names = list(
            set(reference_data.select_dtypes([object]).columns) - set(utility_columns))
        # target_names = None

    if production_data is not None and target_column is not None and prediction_column is not None:
        production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        production_data.dropna(axis=0, how='any', inplace=True)

        if len(prediction_column) <= 2:
            binarizer = preprocessing.LabelBinarizer()
            binarizer.fit(production_data[target_column])
            binarized_target = pd.DataFrame(binarizer.transform(production_data[target_column]))
            binarized_target.columns = ['target']

            fpr, tpr, thrs = metrics.roc_curve(
                binarized_target, production_data[prediction_column[0]])  # problem!!!

            fig = go.Figure()
            fig.add_trace(
                go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC',
                           marker=dict(size=6, color=red)))
            fig.update_layout(yaxis_title="True Positive Rate",
                              xaxis_title="False Positive Rate",
                              showlegend=True)

            fig_json = json.loads(fig.to_json())

            self.wi = BaseWidgetInfo(
                title=self.title,
                type="big_graph",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=1,
                params={
                    "data": fig_json['data'],
                    "layout": fig_json['layout']
                },
                additionalGraphs=[],
            )
        else:
            binarizer = preprocessing.LabelBinarizer()
            binarizer.fit(production_data[target_column])
            binarized_target = pd.DataFrame(binarizer.transform(production_data[target_column]))
            binarized_target.columns = prediction_column

            # plot support bar
            graphs = []

            for label in prediction_column:
                fpr, tpr, thrs = metrics.roc_curve(binarized_target[label],
                                                   production_data[label])

                fig = go.Figure()
                fig.add_trace(
                    go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC',
                               marker=dict(size=6, color=red)))
                fig.update_layout(yaxis_title="True Positive Rate",
                                  xaxis_title="False Positive Rate",
                                  showlegend=True)

                fig_json = json.loads(fig.to_json())

                graphs.append({
                    "id": "tab_" + str(label),
                    "title": str(label),
                    "graph": {
                        "data": fig_json["data"],
                        "layout": fig_json["layout"],
                    }
                })

            self.wi = BaseWidgetInfo(
                title=self.title,
                type="tabbed_graph",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=1,
                params={"graphs": graphs},
                additionalGraphs=[],
            )
    else:
        self.wi = None
def label_binarizer(target):
    lb = preprocessing.LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)
    result = lb.fit_transform(target)
    return result
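# Illustrative usage of the `label_binarizer` helper above (not part of the
# original snippet); it assumes `from sklearn import preprocessing` is in scope.
example_labels = ["cat", "dog", "bird", "dog"]
example_encoded = label_binarizer(example_labels)
print(example_encoded)  # shape (4, 3): one indicator column per class (bird, cat, dog)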
def one_hot(series):
    label_binarizer = pp.LabelBinarizer()
    label_binarizer.fit(range(max(series) + 1))
    return label_binarizer.transform(series)
def trainSimpleDNNObj(self, dataObj, fileName, tensorboardLogFolder, lossType, listOfMetrics,
                      problemType, optimizerName, learningRate, datHyperPara, testSize, scriptObj):
    print(">>>>>>>>>>>>>>SimpleDNN")
    print('pathofdata>>>>>', self.pathOfData)
    predictedClass = None
    targetColumnName = 'target'
    df = dataObj
    indevar = list(df.columns)
    indevar.remove('target')
    targetCol = df[targetColumnName]

    if problemType == 'classification':
        lb = preprocessing.LabelBinarizer()
        y = lb.fit_transform(targetCol)
        predictedClass = list(targetCol.unique())
    else:
        y = df[targetColumnName]
        predictedClass = None

    ##### Split data into test and validation set for training #################################
    trainDataX, testDataX, trainDataY, testDataY = model_selection.train_test_split(
        df[indevar], y, test_size=datHyperPara['testSize'])
    stepsPerEpochT = int(len(trainDataX) / datHyperPara['batchSize'])
    stepsPerEpochV = int(len(testDataX) / datHyperPara['batchSize'])
    kerasUtilities.updateStatusOfTraining(self.statusFile, 'Data split in Train validation part')

    modelObj = self.generateAndCompileModel(lossType, optimizerName, learningRate, listOfMetrics)
    if modelObj.__class__.__name__ == 'dict':
        return
    model = modelObj.model
    tensor_board = self.startTensorBoard(tensorboardLogFolder)

    ##### Train model #################################
    kerasUtilities.updateStatusOfTraining(self.statusFile, 'Training Started')
    try:
        import tensorflow as tf
        with tf.device(gpuCPUSelect(selDev)):
            model.fit(x=trainDataX, y=trainDataY, epochs=datHyperPara['epoch'],
                      callbacks=[tensor_board],
                      validation_data=(testDataX, testDataY),
                      steps_per_epoch=stepsPerEpochT, validation_steps=stepsPerEpochV)
    except Exception as e:
        data_details = self.upDateStatus()
        self.updateStatusWithError(
            data_details, 'Training Failed',
            'Error while fitting data to Keras Model >> ' + str(e),
            traceback.format_exc(), self.statusFile)
        return

    kerasUtilities.updateStatusOfTraining(self.statusFile, 'Training Completed')

    try:
        toExportDict = {
            'model1': {
                'data': self.pathOfData,
                'hyperparameters': datHyperPara,
                'preProcessingScript': scriptObj,
                'pipelineObj': None,
                'modelObj': model,
                'featuresUsed': indevar,
                'targetName': 'target',
                'postProcessingScript': None,
                'taskType': 'trainAndscore',
                'predictedClasses': predictedClass,
                'dataSet': None
            }
        }
        from nyoka.skl.skl_to_pmml import model_to_pmml
        model_to_pmml(toExportDict, PMMLFileName=fileName)
        kerasUtilities.updateStatusOfTraining(self.statusFile, 'PMML file Successfully Saved')
        return 'Success'
    except Exception as e:
        data_details = self.upDateStatus()
        self.updateStatusWithError(data_details, 'Saving File Failed', ' ' + str(e),
                                   traceback.format_exc(), self.statusFile)
        return -1
def load_compas_data():
    FEATURES_CLASSIFICATION = ["age_cat", "race", "sex", "priors_count", "c_charge_degree"]  # features to be used for classification
    CONT_VARIABLES = ["priors_count"]  # continuous features, will need to be handled separately from categorical features; categorical features will be encoded using one-hot
    CLASS_FEATURE = "two_year_recid"  # the decision variable
    SENSITIVE_ATTRS = ["race"]

    COMPAS_INPUT_FILE = "compas-scores-two-years.csv"
    check_data_file(COMPAS_INPUT_FILE)

    # load the data and get some stats
    df = pd.read_csv(COMPAS_INPUT_FILE)
    df = df.dropna(subset=["days_b_screening_arrest"])  # dropping missing vals

    # convert to np array
    data = df.to_dict('list')
    for k in data.keys():
        data[k] = np.array(data[k])

    """ Filtering the data """
    # These filters are the same as propublica (refer to https://github.com/propublica/compas-analysis)
    # If the charge date of a defendant's Compas-scored crime was not within 30 days from when the
    # person was arrested, we assume that, because of data quality reasons, we do not have the right offense.
    idx = np.logical_and(data["days_b_screening_arrest"] <= 30,
                         data["days_b_screening_arrest"] >= -30)

    # We coded the recidivist flag -- is_recid -- to be -1 if we could not find a compas case at all.
    idx = np.logical_and(idx, data["is_recid"] != -1)

    # In a similar vein, ordinary traffic offenses -- those with a c_charge_degree of 'O', which do
    # not result in jail time -- are removed (only two of them).
    idx = np.logical_and(idx, data["c_charge_degree"] != "O")  # F: felony, M: misdemeanor

    # We filtered the underlying data from Broward county to include only those rows representing
    # people who had either recidivated in two years, or had at least two years outside of a
    # correctional facility.
    idx = np.logical_and(idx, data["score_text"] != "NA")

    # we will only consider blacks and whites for this analysis
    idx = np.logical_and(idx, np.logical_or(data["race"] == "African-American",
                                            data["race"] == "Caucasian"))

    # select the examples that satisfy this criteria
    for k in data.keys():
        data[k] = data[k][idx]

    """ Feature normalization and one hot encoding """
    # convert class label 0 to -1
    y = data[CLASS_FEATURE]
    y[y == 0] = -1

    print("\nNumber of people recidivating within two years")
    print(pd.Series(y).value_counts())
    print("\n")

    X = np.array([]).reshape(len(y), 0)  # empty array with num rows same as num examples, will hstack the features to it
    x_control = defaultdict(list)

    feature_names = []
    for attr in FEATURES_CLASSIFICATION:
        vals = data[attr]
        if attr in CONT_VARIABLES:
            vals = [float(v) for v in vals]
            vals = preprocessing.scale(vals)  # 0 mean and 1 variance
            vals = np.reshape(vals, (len(y), -1))  # convert from 1-d arr to a 2-d arr with one col
        else:  # for binary categorical variables, the label binarizer uses just one var instead of two
            lb = preprocessing.LabelBinarizer()
            lb.fit(vals)
            vals = lb.transform(vals)

        # add to sensitive features dict
        if attr in SENSITIVE_ATTRS:
            x_control[attr] = vals

        # add to learnable features
        X = np.hstack((X, vals))

        if attr in CONT_VARIABLES:  # continuous feature, just append the name
            feature_names.append(attr)
        else:  # categorical features
            if vals.shape[1] == 1:  # binary features that passed through lib binarizer
                feature_names.append(attr)
            else:
                for k in lb.classes_:  # non-binary categorical features, need to add the names for each cat
                    feature_names.append(attr + "_" + str(k))

    # convert the sensitive feature to 1-d array
    x_control = dict(x_control)
    for k in x_control.keys():
        assert x_control[k].shape[1] == 1  # make sure that the sensitive feature is binary after one hot encoding
        x_control[k] = np.array(x_control[k]).flatten()

    # sys.exit(1)

    """permute the data randomly"""
    perm = list(range(0, X.shape[0]))
    shuffle(perm)
    X = X[perm]
    y = y[perm]
    for k in x_control.keys():
        x_control[k] = x_control[k][perm]

    # X = ut.add_intercept(X)
    X = np.c_[np.ones(len(X)), X]
    feature_names = ["intercept"] + feature_names

    assert len(feature_names) == X.shape[1]
    print("Features we will be using for classification are:", feature_names, "\n")

    return X, y, x_control
pyplot.show()

# preprocessing dataset2
# merge the train and test set because it is unsupervised learning
dataset2 = pd.read_csv("./DATASET/BANK/MT_Train.csv")
dataset2.drop('default', axis=1, inplace=True)

le = LabelEncoder()
var_to_encode = [
    'job', 'marital', 'education', 'day_of_week', 'month', 'housing',
    'loan', 'poutcome'
]
for col in var_to_encode:
    dataset2[col] = le.fit_transform(dataset2[col])

dataset2["contact"] = preprocessing.LabelBinarizer().fit_transform(dataset2["contact"])
dataset2[["pdays"]] = preprocessing.Binarizer(threshold=998).transform(dataset2[["pdays"]])
dataset2["y"] = preprocessing.LabelBinarizer().fit_transform(dataset2["y"])

x2 = dataset2[dataset2.columns.drop('y')]
y2 = list(dataset2['y'])

scaler = StandardScaler()
scaler.fit(x2)
x2_n = scaler.transform(x2)

# clustering without scaling
gmm = GaussianMixture(n_components=2, max_iter=100, random_state=RAND).fit(x2)
labels = gmm.predict(x2)

# visualization in 2d needs the dimension reduction to 2d thus we reduce it in PCA
feature_sort = PCA(n_components=5).fit(x2)
import pandas as pd
import matplotlib.pyplot as plt
import keras.datasets as keras_datasets
import keras.utils as keras_utils
import keras.models as keras_models
import keras.layers as keras_layers
import sklearn.preprocessing as sk_preprocessing

iris_dataset = pd.read_csv("data/iris.csv").sample(frac=1)
column_names = iris_dataset.columns.tolist()
X = iris_dataset[column_names[:-1]]
y = iris_dataset[column_names[-1]]

min_max_scaler = sk_preprocessing.MinMaxScaler()
X = min_max_scaler.fit_transform(X)
one_hot_encoder = sk_preprocessing.LabelBinarizer()
y = one_hot_encoder.fit_transform(y)

for hidden_neurons in [1, 2, 4, 8, 16]:
    ANN_model = keras_models.Sequential()
    ANN_model.add(keras_layers.Dense(hidden_neurons, activation="relu", input_shape=(4,)))
    ANN_model.add(keras_layers.Dense(3, activation="softmax"))
    ANN_model.compile(loss="mean_squared_error", optimizer="adam",
def predict():
    dat = flask.request.form['date']
    time = flask.request.form['time']

    if str(dat) in holidays_tt:
        holiday = 1
    else:
        holiday = 0

    response = requests.get(url).json()
    # Kelvin to Celsius
    temp = float(response["main"]["temp"]) - 273.15
    temp_min = float(response["main"]["temp_min"]) - 273.15
    temp_max = float(response["main"]["temp_max"]) - 273.15
    pressure = response["main"]["pressure"]
    humidity = response["main"]["humidity"]

    # week
    date_time_obj = datetime.datetime.strptime(dat, '%Y-%m-%d')
    week = datetime.date(date_time_obj.year, date_time_obj.month,
                         date_time_obj.day).isocalendar()[1]

    # hour
    hour = int(time[:-3])

    # population
    dic = {
        "HSR Division": 105265,
        "Koramangala Division": 63987,
        "Indiranagar": 58830,
        "Shivajinagar": 57437,
        "Hebbal": 54301,
        "Whitefield": 84428,
        "Malleshwaram": 57107,
        "Rajaji Nagara Division": 55250,
        "Jayanagar": 56658,
        "Jalahalli": 63391,
        "Kengeri Division": 68087,
        "R R NAGAR": 82848,
        "Vidhanasoudha": 69057,
        "Peenya Division": 96549
    }

    lb = preprocessing.LabelBinarizer()
    lb.fit([
        'HSR Division', 'Koramangala Division', 'Indiranagar', 'Shivajinagar',
        'Hebbal', 'Whitefield', 'Malleshwaram', 'Rajaji Nagara Division',
        'Jayanagar', 'Jalahalli', 'Kengeri Division', 'R R NAGAR',
        'Vidhanasoudha', 'Peenya Division'
    ])
    lt = list(dic.keys())
    df = pd.DataFrame(lt)
    divs = lb.transform(df)
    divs = pd.DataFrame(divs)

    week = [week] * 14
    temp_max = [temp_max] * 14
    temp_min = [temp_min] * 14
    holiday = [holiday] * 14

    divs = pd.concat([pd.DataFrame(temp_max), divs], axis=1)
    divs = pd.concat([pd.DataFrame(temp_min), divs], axis=1)
    divs = pd.concat([pd.DataFrame(week), divs], axis=1)
    divs = pd.concat([divs, pd.DataFrame(holiday)], axis=1)

    pop = [dic[x] for x in lt]  # population
    divs = pd.concat([divs, pd.DataFrame(pop)], axis=1)

    hour = [hour] * 14
    divs = pd.concat([divs, pd.DataFrame(hour)], axis=1)

    from sklearn.preprocessing import StandardScaler
    sc_X = StandardScaler()
    divs = sc_X.fit_transform(divs)

    with graph.as_default():
        prd = model.predict(divs)
    newprd = prd.tolist()

    # return to webpage
    return flask.render_template("index.html", data=newprd)
# Adapted from: https://docs.python.org/2/library/gzip.html
with gzip.open('data/train-images-idx3-ubyte.gz', 'rb') as f:
    train_img = f.read()
with gzip.open('data/train-labels-idx1-ubyte.gz', 'rb') as f:
    train_lbl = f.read()

# read in all images and labels into memory
train_img = ~np.array(list(train_img[16:])).reshape(60000, 28, 28).astype(np.uint8) / 255.0
train_lbl = np.array(list(train_lbl[8:])).astype(np.uint8)

# Flatten the array so the inputs can be mapped to the input neurons
inputs = train_img.reshape(60000, 784)

# encode the labels into binary format
encoder = pre.LabelBinarizer()
# get the size of the array needed for each category
encoder.fit(train_lbl)
# encode each label to be used as binary outputs
outputs = encoder.transform(train_lbl)

# print out the integer value and the new representation of the number
print(train_lbl[0], outputs[0])
# print out each array
for i in range(10):
    print(i, encoder.transform([i]))
def get_onehot_encoded(class_ids):
    assert len(np.unique(class_ids)) > 1, "You need more than one class in the class label"
    lb = preprocessing.LabelBinarizer()
    train_lbl = lb.fit_transform(class_ids)
    return train_lbl, lb.classes_
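# Hypothetical usage of `get_onehot_encoded` above (not from the original source),
# assuming `from sklearn import preprocessing` and `import numpy as np` are in scope.
example_ids = [0, 2, 1, 2, 0]
example_onehot, example_classes = get_onehot_encoded(example_ids)
print(example_classes)  # [0 1 2]
print(example_onehot)   # (5, 3) indicator matrix, one column per class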
def path_to_tensor(img_path):
    img = image.load_img(img_path, target_size=(HEIGHT, WIDTH))
    # convert PIL.Image.Image type to 3D tensor with shape (224, 224, 3)
    x = image.img_to_array(img)
    # convert 3D tensor to 4D tensor with shape (1, 224, 224, 3) and return 4D tensor
    return np.expand_dims(x, axis=0)


def paths_to_tensor(prefix, img_paths):
    list_of_tensors = [path_to_tensor(prefix + img_path) for img_path in tqdm(img_paths)]
    return preprocess_input(np.vstack(list_of_tensors))


X_train = paths_to_tensor("train/", np.array(train['file']))
X_valid = paths_to_tensor("train/", np.array(valid['file']))

labels = sorted(train['label'].unique())
one_hot_encoding = preprocessing.LabelBinarizer()
one_hot_encoding.fit(labels)
y_train = one_hot_encoding.transform(np.array(train['label']))
y_valid = one_hot_encoding.transform(np.array(valid['label']))

model = Sequential()
model.add(ResNet50(weights='imagenet', include_top=False, input_shape=(HEIGHT, WIDTH, 3)))
for layer in model.layers:
    layer.trainable = False
model.add(GlobalAveragePooling2D())
model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(len(labels), activation='softmax'))
def fit(self, values, n_total=None, **kwargs):
    """Fit vectorizer to the provided data

    Parameters
    ----------
    values : array-like, [n_samples]
        Strings for fitting the vectorizer.

    n_total : int or None, optional (default=None)
        Total number of documents that values are extracted from.
        If None, defaults to ``len(values)``.

    **kwargs : Ignored
        Keyword arguments.

    Returns
    -------
    self or None
        Returns None if `values` only includes one unique item,
        otherwise returns `self`.
    """
    values = [value.lower() for value in values]
    if n_total is None:
        n_total = len(values)
    if isinstance(self.n_average, float):
        n_average = self.n_average * n_total
    else:
        n_average = float(self.n_average)

    params = copy.copy(self.params)
    if isinstance(params['min_df'], float):
        params['min_df'] = max(int(params['min_df'] * n_total), 1)
    else:
        params['min_df'] = params['min_df']

    unique_values = set(values)
    max_categories = min(self.max_categories, len(values) / n_average)
    if len(unique_values) <= 1:
        return None
    elif len(unique_values) <= max_categories:
        # Categorization
        self._categorical = True
        self._vectorizer = preprocessing.LabelBinarizer(sparse_output=True)
        self._vectorizer.fit(values)
        if self._vectorizer.y_type_ == 'binary':
            self.feature_names_ = [
                u'= {1} (!= {0})'.format(*self._vectorizer.classes_)
            ]
        else:
            self.feature_names_ = [
                u'= {}'.format(category)
                for category in self._vectorizer.classes_
            ]
    else:
        # Tokenization
        self._categorical = False
        self._vectorizer = feature_extraction.text.CountVectorizer(
            binary=True, dtype=bool, **params)
        try:
            self._vectorizer.fit(values)
        except ValueError:
            return None
        self.feature_names_ = [
            u'has token "{}"'.format(feature_name)
            for feature_name in self._vectorizer.get_feature_names()
        ]
        if hasattr(self._vectorizer, 'stop_words_'):
            delattr(self._vectorizer, 'stop_words_')
    return self
def __init__(self, filename):
    self.char_seq = list(self.load_text(filename))
    self.label_encoder = preprocessing.LabelBinarizer()
    self.enc_text = self.label_encoder.fit_transform(self.char_seq)
    self.tot_chars, self.num_classes = self.enc_text.shape
print(dataframe[~dataframe["feature_1"].isnull()])

# Impute missing values
features, _ = make_blobs(n_samples=1000, n_features=2, random_state=1)
scaler = preprocessing.StandardScaler()
sd_features = scaler.fit_transform(features)
true_value = sd_features[0, 0]
sd_features[0, 0] = np.nan
from sklearn.impute import SimpleImputer  # preprocessing.Imputer was removed from newer scikit-learn
mean_imputer = SimpleImputer(strategy='mean')
features_mean_imputed = mean_imputer.fit_transform(sd_features)
print("true value:", true_value)
print("imputed value:", features_mean_imputed[0, 0])

# one-hot
feature = np.array([["Texas"], ["California"], ["Texas"], ["Delaware"]])
one_hot = preprocessing.LabelBinarizer()
one_hot_re = one_hot.fit_transform(feature)
print(one_hot_re)
print(one_hot.classes_)
print(one_hot.inverse_transform(one_hot.transform(feature)))
print(pd.get_dummies(feature[:, 0]))

multiclass_feature = [("Texas", "Florida"), ("California", "Alabama"), ("Texas", "Alabama")]
one_hot_multi = preprocessing.MultiLabelBinarizer()
print(one_hot_multi.fit_transform(multiclass_feature))

dataframe = pd.DataFrame({"Score": ["Low", "Low", "Medium", "Medium", "High"]})
scaler_mapper = {"Low": 1, "Medium": 2, "High": 3}
print(dataframe["Score"].replace(scaler_mapper))

# Dictionary features
for item in already_numerical:
    data[item] = total_dframe[item]

# In[274]:

for item in ordered:  # enumerate keys
    d = ascending(ordered[item])
    data[item] = list(map(lambda x: d[x], total_dframe[item]))

# In[275]:

label_binarizer = dict()
for item in unordered:
    lb = pre.LabelBinarizer()
    label_binarizer[item] = lb  # save for later decoding
    new_columns = lb.fit_transform(total_dframe[item])
    for class_, content in zip(lb.classes_, new_columns.T):
        data[item + " " + str(class_)] = content

# In[276]:

# add inflation as an attribute
# usa_inflation = pd.DataFrame.from_csv("inflation_usa.csv")
# times = list(usa_inflation['TIME'])
# values = np.array(usa_inflation['Value'])  # in percent per month
# values = values/100/20+1  # scaled to match with http://www.wolframalpha.com/input/?i=total+inflation+from+2005+to+2010+in+usa
# total_inflation = np.zeros(len(values))
Total parameters = 4*4 + 4*3 + 4 + 3 = 16 + 12 + 4 + 3 = 35
"""
input_neurons = X.shape[1]
hidden_neurons = 4
output_neurons = np.unique(Y).shape[0]
tot_genes = (
    hidden_neurons * input_neurons
    + output_neurons * hidden_neurons
    + hidden_neurons
    + output_neurons
)
params = intial_pop(input_neurons, hidden_neurons, output_neurons, pop_size)

# Convert labels to binary form
encoder = pp.LabelBinarizer()
Y = encoder.fit_transform(Y)

# shuffle the input and output classes
X_shuffle, Y_shuffle = utils.shuffle(X, Y)

"""
The number of iterations is 100.
The crossover used is one-point crossover and mutation is performed on the
offsprings generated from the crossover.
Top 4 individuals are passed as-is to the next population and the rest 26
(26 offspring from 13 crossovers of 26 parents) are generated using crossover
and mutation.
The mutation probability = 1/(iterationnum + 10)
The crossover probability = 0.8 (always perform crossover)
"""
num_iterations = 1000
mut_rate = 0
cross_rate = 0.8

# stores the average loss
loss_avg = [float(x) for x in range(num_iterations)]
def MLP(name, input_dir, best_dir, output):
    if not os.path.exists(best_dir):
        os.makedirs(best_dir)
    best_dir_dat = "/".join((best_dir, name))
    if not os.path.exists(best_dir_dat):
        os.makedirs(best_dir_dat)

    colnames = "HType,ABType,dimension,learnFac,margin,constr,LType,MLP_acc,MLP_wF1,MLP_epoch"
    with open(output, "w") as file:
        file.write(colnames)
        file.write("\n")

    models = sorted(os.listdir(input_dir))
    for model in models:
        modelpath = "/".join((input_dir, model))
        files = sorted(os.listdir(modelpath))

        # create model subdir to store best MLP models
        best_subdir = "/".join((best_dir_dat, model))
        if not os.path.exists(best_subdir):
            os.makedirs(best_subdir)

        for i, file in enumerate(files):
            print(i)
            # embedding datasets
            labelpath = "/".join((modelpath, file))
            dataset = pd.read_csv(labelpath, index_col=0)
            # specify file path to store best MLP model [for later]
            filepath = best_subdir + "/" + file[:-4] + ".hdf5"

            ################################################################################
            #############################    DATA SPLIT      ##############################
            ################################################################################
            lb = preprocessing.LabelBinarizer()
            lb.fit(list(dataset["pgroup"]))

            X_train = dataset[dataset["split"] == "LRN"].iloc[:, 1:-2].values
            y_train = dataset[dataset["split"] == "LRN"].iloc[:, -1].values
            # get weights first
            weights = compute_class_weight("balanced", np.unique(y_train), y_train)
            # then transform
            y_train = lb.transform(y_train)

            X_valid = dataset[dataset["split"] == "VLD"].iloc[:, 1:-2].values
            y_valid = dataset[dataset["split"] == "VLD"].iloc[:, -1].values
            y_valid = lb.transform(y_valid)

            X_test = dataset[dataset["split"] == "TST"].iloc[:, 1:-2].values
            y_test = dataset[dataset["split"] == "TST"].iloc[:, -1].values
            y_test = lb.transform(y_test)

            ################################################################################
            #############################    CLASSIFIER STRUCTURE    ######################
            ################################################################################
            classifier = Sequential()
            dim = len(dataset.iloc[0, 1:-2])
            nodes = dim * 2
            # Hidden layer
            classifier.add(Dense(nodes, activation="sigmoid",
                                 kernel_initializer="uniform", input_dim=dim))
            # Output layer
            classifier.add(Dense(8, activation="softmax", kernel_initializer="uniform"))
            # compile the model
            sgd = optimizers.SGD(lr=0.01, decay=0.0, momentum=0.0, nesterov=False)
            classifier.compile(optimizer=sgd, loss="categorical_crossentropy",
                               metrics=["accuracy"])

            ################################################################################
            #############################    MODEL FITTING    #############################
            ################################################################################
            # checkpoint best model
            checkpoint = ModelCheckpoint(filepath, monitor="val_acc", verbose=0,
                                         save_best_only=True, mode="auto")
            # model settings and fit
            history = classifier.fit(X_train, y_train,
                                     validation_data=(X_valid, y_valid),
                                     epochs=5000, verbose=0, callbacks=[checkpoint],
                                     class_weight=weights)

            ################################################################################
            #############################    MAKE PREDICTIONS    ##########################
            ################################################################################
            # load best model
            final_model = load_model(filepath)
            # get accuracy
            scores = final_model.evaluate(X_test, y_test, verbose=0)

            # get weighted F1-by-class
            le = preprocessing.LabelEncoder()
            le.fit(list(dataset["pgroup"]))
            y_test2 = dataset[dataset["split"] == "TST"].iloc[:, -1].values
            y_test2 = le.transform(y_test2)
            y_pred = final_model.predict_classes(X_test, verbose=0)
            weighted_f1 = f1_score(y_test2, y_pred, average="weighted")

            # get best epoch
            acc_history = history.history["val_acc"]
            best_epoch = acc_history.index(max(acc_history)) + 1

            K.clear_session()  # destroy TF graph to avoid loop slowing down

            ################################################################################
            #############################    ASSEMBLE W/ CONFIG    ########################
            ################################################################################
            # get model type (H1-4, A/B)
            modelType = model.split("-")[1]  # ["H1A"]
            HType = modelType[0:2]
            ABType = modelType[-1]
            # get dimension
            filenamesplit = file.split("-")
            dimension = int([s for s in filenamesplit if "D00" in s][0][1:])
            # get learnFac
            learnFac = int([s for s in filenamesplit if "LF0" in s][0][3:])
            # get margin
            margin = float([s for s in filenamesplit if "LM" in s][0][2:])
            # get constraint
            constr = [s for s in filenamesplit if "_VALUE" in s][0][:-6].lower()
            # get LType
            LType = filenamesplit[-1][:2]

            with open(output, "a") as file:
                file.write("%s,%s,%d,%d,%.1f,%s,%s,%.17f,%.17f,%d" %
                           (HType, ABType, dimension, learnFac, margin, constr,
                            LType, scores[1], weighted_f1, best_epoch))
                file.write("\n")
projects.get_data()
# projects.sample(frac = .5)
projects.train_test_split(train_size=.67)
projects.balance()
projects.X_Y_split(y_col='funded')

X_train = projects.X_train
Y_train = projects.Y_train
X_test = projects.X_test
Y_test = projects.Y_test

# Modeling -------------------------------------------------------------
mapper = DataFrameMapper([
    ('total_price_excluding_optional_support', preprocessing.StandardScaler()),
    ('students_reached', preprocessing.StandardScaler()),
    ('school_state', preprocessing.LabelBinarizer()),
    ('school_charter', preprocessing.LabelBinarizer()),
    ('school_magnet', preprocessing.LabelBinarizer()),
    ('school_year_round', preprocessing.LabelBinarizer()),
    ('school_nlns', preprocessing.LabelBinarizer()),
    ('school_kipp', preprocessing.LabelBinarizer()),
    ('school_charter_ready_promise', preprocessing.LabelBinarizer()),
    ('teacher_prefix', preprocessing.LabelBinarizer()),
    ('teacher_teach_for_america', preprocessing.LabelBinarizer()),
    ('teacher_ny_teaching_fellow', preprocessing.LabelBinarizer()),
    ('primary_focus_area', preprocessing.LabelBinarizer()),
    ('resource_type', preprocessing.LabelBinarizer()),
    ('poverty_level', preprocessing.LabelBinarizer()),
    ('grade_level', preprocessing.LabelBinarizer()),
])
def _preprocessor(self, x, y=None, training=False):
    """
    Preprocess input of the network.

    Arguments:
        - x {pd.DataFrame} -- Raw input array of shape (batch_size, input_size).
        - y {pd.DataFrame} -- Raw target array of shape (batch_size, 1).
        - training {boolean} -- Boolean indicating if we are training or testing the model.

    Returns:
        - {torch.tensor} -- Preprocessed input array of size (batch_size, input_size).
        - {torch.tensor} -- Preprocessed target array of size (batch_size, 1).
    """
    #######################################################################
    #                       ** START OF YOUR CODE **
    #######################################################################

    # Impute/fill the missing values with the mean:
    # could also impute the value using median, KNN or MICE.
    if training:
        self.impute_value = x.mean()
    data = x.fillna(self.impute_value)

    # Use label binarizer to convert categorical values to one hot encoded values.
    if training:
        # for col in categorical_columns:
        self.label_binarizer = preprocessing.LabelBinarizer().fit(data['ocean_proximity'])

    # Continue with transforming cat cols as label_binarizer has already been initialised
    one_hot_encodings = self.label_binarizer.transform(data['ocean_proximity'])
    column_labels = self.label_binarizer.classes_
    for i, column_label in enumerate(column_labels):
        data[column_label] = one_hot_encodings[:, i]

    # Drop the categorical ('ocean_proximity') column
    data = data.drop('ocean_proximity', axis=1)

    if training:
        # Min max scale each column individually
        self.x_min_max_scaler = preprocessing.MinMaxScaler().fit(data.values)
        # if isinstance(y, pd.DataFrame):
        #     self.y_min_max_scaler = preprocessing.MinMaxScaler().fit(y.values)

    # Apply min-max normalisation
    # Convert to numpy array
    column_labels = list(data.columns)
    # Transform the values of the array with the existing min max scaler
    data_np_scaled = self.x_min_max_scaler.transform(data.values)
    # Convert back to Pandas DataFrame
    data = pd.DataFrame(data_np_scaled, index=data.index, columns=data.columns)
    # Set columns back to column names
    data.columns = column_labels

    # if isinstance(y, pd.DataFrame):
    #     # Apply min-max normalisation
    #     # Convert to numpy array
    #     y_labels = list(y.columns)
    #     # Transform the values of the array with the existing min max scaler
    #     y_np_scaled = self.y_min_max_scaler.transform(y.values)
    #     # Convert back to Pandas DataFrame
    #     y = pd.DataFrame(y_np_scaled, index=y.index, columns=y.columns)
    #     # Set columns back to column names
    #     y.columns = y_labels

    # Convert pandas dataframes into torch tensors:
    x_tensor = torch.tensor(data.values)  # Replace this code with your own

    # Return preprocessed x and y, return None for y if it was None
    return x_tensor, (torch.tensor(y.values) if isinstance(y, pd.DataFrame) else None)
def __init__(self, iter=100):
    self.iter = iter
    self.label_binarizer = preprocessing.LabelBinarizer()
    self.neuron_and1 = lr.BinominalLogisticRegression('and1')
    self.neuron_and2 = lr.BinominalLogisticRegression('and2')
    self.neuron_or = lr.BinominalLogisticRegression('or')
def prediction(x_calib, y_calib, x_valid, y_valid, plot_components=False):
    mse = []
    component = np.arange(1, 40)
    for i in component:
        pls = PLSRegression(n_components=i)
        pls.fit(x_calib, y_calib)
        y_pred = pls.predict(x_valid)
        mse_p = mean_squared_error(y_valid, y_pred)
        mse.append(mse_p)

        comp = 100 * (i + 1) / 40
        stdout.write("\r%d%% completed" % comp)
        stdout.flush()
    stdout.write("\n")

    msemin = np.argmin(mse)
    print("Suggested number of components: ", msemin + 1)
    stdout.write("\n")

    if plot_components is True:
        with plt.style.context(('ggplot')):
            plt.plot(component, np.array(mse), '-v', color='blue', mfc='blue')
            plt.plot(component[msemin], np.array(mse)[msemin], 'P', ms=10, mfc='red')
            plt.xlabel('Number of PLS components')
            plt.ylabel('MSE')
            plt.title('PLS')
            plt.xlim(xmin=-1)
            plt.show(block=False)
        _ = input("Press [enter] to continue.")

    pls = PLSRegression(n_components=msemin + 1)
    pls.fit(x_calib, y_calib)

    startTime = time.time()
    y_pred = pls.predict(x_valid)
    endTime = time.time()
    print('Time elapsed: %s seconds' % (endTime - startTime))

    lb = preprocessing.LabelBinarizer()

    score_p = r2_score(y_valid, y_pred)
    mse_p = mean_squared_error(y_valid, y_pred)

    lb.fit_transform(y_valid)
    score = r2_score(y_valid, y_pred)

    print('R2: %5.3f' % score_p)
    print('MSE: %5.3f' % mse_p)

    # print
    pr = lb.inverse_transform(y_pred)
    ac = lb.inverse_transform(y_valid)
    # print type(pr[0])
    # print ac

    sum = 0
    for j in range(len(pr)):
        if np.array_equal(pr[j], ac[j]):
            sum += 1
    print('Accuracy: ' + str((float(sum) / float(len(pr))) * 100) + '%')
def trainer(dict_csv='test.csv'):
    data = pd.read_csv(dict_csv, engine='python')
    train_size = int(len(data) * .7)

    train_posts = data['documents']
    train_tags = data['tags']
    test_posts = data['documents'][train_size:]
    test_tags = data['tags'][train_size:]
    posts = data['documents']
    dlp_data = {'filename': [], 'tags': []}

    vocab_size = 10000
    tokenize = text.Tokenizer(num_words=vocab_size)
    tokenize.fit_on_texts(train_posts)

    # save tokenizer
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenize, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Saving tokenizer with name tokenizer.pickle")

    x_train = tokenize.texts_to_matrix(train_posts)
    x_test = tokenize.texts_to_matrix(test_posts)
    x_post = tokenize.texts_to_matrix(posts)

    encoder = preprocessing.LabelBinarizer()
    encoder.fit(train_tags)
    y_train = encoder.transform(train_tags)
    y_test = encoder.transform(test_tags)
    text_labels = encoder.classes_

    num_labels = len(np.unique(y_train))
    batch_size = 1024

    model = Sequential()
    ## Hidden layers, added to increase accuracy
    model.add(Dense(512, input_shape=(vocab_size,)))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dense(128))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dense(512))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dense(128))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dense(64))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dense(num_labels))
    model.add(BatchNormalization())
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    history = model.fit(x_train, y_train,
                        batch_size=batch_size,
                        epochs=256,
                        verbose=1,
                        validation_split=0.1)
    score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)

    model_json = model.to_json()
    with open("model.json", "w") as json_file:
        json_file.write(model_json)
    print("\n Saved h5 json model to disk with name model.json ")

    model.save_weights("model.h5")
    print("\n Saved model to disk with name model.h5")
    print("Training done")

    pred = model.predict(np.array(x_post))
    pred = pred > 0.5
    for i in range(0, len(posts)):
        print('Document name: %s, is %s' % (data['filename'][i], text_labels[np.argmax(pred[i])]))
        dlp_data['filename'].append(data['filename'][i])
        dlp_data['tags'].append(text_labels[np.argmax(pred[i])])

    df = pd.DataFrame(dlp_data, columns=['filename', 'tags'])
    df.to_csv('dlp.csv', encoding="utf-8")
    print('Saved CSV model')

    json_file = open('model.json', 'r')
    loaded_json_model = json_file.read()
    json_file.close()
    loaded_model = model_from_json(loaded_json_model)
    loaded_model.load_weights("model.h5")
    print("Loaded model from disk")

    loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    score = loaded_model.evaluate(x_test, y_test, verbose=1)
    print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1] * 100))
def reverse_one_hot(originalSeries, newSeries):
    label_binarizer = pp.LabelBinarizer()
    label_binarizer.fit(range(max(originalSeries) + 1))
    return label_binarizer.inverse_transform(newSeries)
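# Illustrative round trip (not from the original source) combining the
# `one_hot` and `reverse_one_hot` helpers shown earlier; it assumes
# `import sklearn.preprocessing as pp` is in scope.
original_values = [0, 3, 1, 2]
encoded_values = one_hot(original_values)                   # (4, 4) indicator matrix
decoded_values = reverse_one_hot(original_values, encoded_values)
print(decoded_values)                                       # [0 3 1 2]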
def getHistoryFeature(df):
    # --get order feature
    feature = df.groupby('userid')['orderType'].agg(['sum', 'count']).reset_index().rename(
        columns={'sum': 'order_num_1', 'count': 'order_num'})
    feature['order_num_0'] = feature['order_num'] - feature['order_num_1']
    feature['order_ratio_0'] = feature['order_num_0'].astype('float') / feature['order_num']
    feature['order_ratio_1'] = feature['order_num_1'].astype('float') / feature['order_num']
    f = feature

    # --get total feature
    feature = df.groupby('userid')['city', 'country', 'continent'].count().reset_index().rename(
        columns={'city': 'city_num', 'country': 'country_num', 'continent': 'continent_num'})
    f = pd.merge(f, feature, on='userid', how='left')

    # This feature can be removed or modified
    # feature = df[df.orderType == 1].groupby('userid')['city', 'country', 'continent'].count().reset_index().rename(
    #     columns={'city': 'city_num_1', 'country': 'country_num_1', 'continent': 'continent_num_1'})
    # f = pd.merge(f, feature, on='userid', how='left').fillna(0)
    # for val in ['city_num', 'country_num', 'continent_num']:
    #     f[val.split('_')[0] + '_ratio_1'] = f[val + '_1'].astype('float') / f[val]

    # --get country feature
    le = preprocessing.LabelBinarizer()
    encoder1 = le.fit(df.country.values)
    le = preprocessing.LabelBinarizer()
    encoder2 = le.fit(df.continent.values)

    country_encoder = encoder1.transform(df.country.values)
    country_encoder_col = ['country_%d' % i for i in range(country_encoder.shape[1])]
    df1 = pd.DataFrame(country_encoder, columns=country_encoder_col)
    df1['userid'] = df['userid'].values
    feature = df1.groupby('userid')[country_encoder_col].agg(['sum', 'count']).reset_index()
    f = pd.merge(f, feature, on='userid', how='left')

    # --get continent feature
    # le = preprocessing.LabelBinarizer()
    continent_encoder = encoder2.transform(df.continent.values)
    continent_encoder_col = ['continent_%d' % i for i in range(continent_encoder.shape[1])]
    df1 = pd.DataFrame(continent_encoder, columns=continent_encoder_col)
    df1['userid'] = df['userid'].values
    feature = df1.groupby('userid')[continent_encoder_col].agg(['sum', 'count']).reset_index()
    f = pd.merge(f, feature, on='userid', how='left')

    # --get orderTime last feature
    # df1 = df.groupby('userid').apply(lambda x: x.sort_values('orderTime', ascending=False).head(1)).reset_index(drop=True)[['userid', 'orderid', 'orderTime', 'orderType']]
    # df1.columns = [['userid', 'last_orderid', 'last_orderTime', 'last_orderType']]
    # f = pd.merge(f, df1, on='userid', how='left')

    # --get orderTime last 5 feature
    # df1 = df.groupby('userid').apply(lambda x: x.sort_values('orderTime', ascending=False).head(5)).reset_index(drop=True)[['userid', 'orderid', 'orderTime', 'orderType']]
    # df1.columns = [['userid', 'last_orderid', 'last_orderTime', 'last_orderType']]
    # temp = pd.concat([df1, df1.groupby('userid').rank(method='first').astype('int').reset_index().rename(
    #     columns={'last_orderTime': 'last_orderTime_rank'})['last_orderTime_rank']], axis=1)
    # ff1 = temp.pivot('userid', 'last_orderTime_rank', 'last_orderType')
    # ff1.columns = ['his_type%d' % i for i in range(ff1.shape[1])]
    # ff1 = ff1.reset_index()
    # f = pd.merge(f, ff1, on='userid', how='left')
    # ff2 = temp.pivot('userid', 'last_orderTime_rank', 'last_orderTime')
    # ff2.columns = ['his_time%d' % i for i in range(ff2.shape[1])]
    # ff2 = ff2.reset_index()
    # f = pd.merge(f, ff2, on='userid', how='left')

    return f
def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
    lb = preprocessing.LabelBinarizer()
    lb.fit(y_test)
    y_test = lb.transform(y_test)
    y_pred = lb.transform(y_pred)
    # compute ROC AUC on the binarized labels
    return metrics.roc_auc_score(y_test, y_pred, average=average)
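# Hypothetical call to `multiclass_roc_auc_score` above (not from the original
# source), assuming `from sklearn import metrics, preprocessing` is in scope.
y_true_example = [0, 1, 2, 2, 1, 0]
y_pred_example = [0, 1, 1, 2, 1, 0]
print(multiclass_roc_auc_score(y_true_example, y_pred_example, average="macro"))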
def __init__(self):
    self.model = pickle.load(open("models/kmeans_model.pkl", "rb"))
    self.labelmaker = preprocessing.LabelBinarizer()
for row in reader:
    # print row
    labelList.append(row[len(row) - 1])
    rowDict = {}
    for i in range(1, len(row) - 1):
        rowDict[header[i]] = row[i]
    # print rowDict
    featureList.append(rowDict)
# print featureList

# vet
vet = DictVectorizer()
dummyX = vet.fit_transform(featureList).toarray()

lb = preprocessing.LabelBinarizer()
dummyY = lb.fit_transform(labelList)
# print dummyY

# classify
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(dummyX, dummyY)

with open('common.dot', 'w') as f:
    f = tree.export_graphviz(clf, out_file=f, feature_names=vet.get_feature_names())

oneRowX = dummyX[0, :]
print(str(oneRowX))

newRowX = oneRowX
newRowX[0] = 1
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
    if column_mapping:
        date_column = column_mapping.get('datetime')
        id_column = column_mapping.get('id')
        target_column = column_mapping.get('target')
        prediction_column = column_mapping.get('prediction')
        num_feature_names = column_mapping.get('numerical_features')
        if num_feature_names is None:
            num_feature_names = []
        else:
            num_feature_names = [
                name for name in num_feature_names
                if is_numeric_dtype(reference_data[name])
            ]

        cat_feature_names = column_mapping.get('categorical_features')
        if cat_feature_names is None:
            cat_feature_names = []
        else:
            cat_feature_names = [
                name for name in cat_feature_names
                if is_numeric_dtype(reference_data[name])
            ]
    else:
        date_column = 'datetime' if 'datetime' in reference_data.columns else None
        id_column = None
        target_column = 'target' if 'target' in reference_data.columns else None
        prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

        utility_columns = [date_column, id_column, target_column, prediction_column]

        num_feature_names = list(
            set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns))
        cat_feature_names = list(
            set(reference_data.select_dtypes([object]).columns) - set(utility_columns))

    if target_column is not None and prediction_column is not None:
        reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
        reference_data.dropna(axis=0, how='any', inplace=True)

        binarizer = preprocessing.LabelBinarizer()
        binarizer.fit(reference_data[target_column])
        binarized_target = binarizer.transform(reference_data[target_column])

        array_prediction = reference_data[prediction_column].to_numpy()
        prediction_ids = np.argmax(array_prediction, axis=-1)
        prediction_labels = [prediction_column[x] for x in prediction_ids]

        # calculate quality metrics
        if len(prediction_column) > 2:
            roc_auc = metrics.roc_auc_score(binarized_target, array_prediction, average='macro')
            log_loss = metrics.log_loss(binarized_target, array_prediction)
        else:
            roc_auc = metrics.roc_auc_score(binarized_target,
                                            reference_data[prediction_column[0]],
                                            average='macro')
            log_loss = metrics.log_loss(binarized_target,
                                        reference_data[prediction_column[0]])

        accuracy_score = metrics.accuracy_score(reference_data[target_column], prediction_labels)
        avg_precision = metrics.precision_score(reference_data[target_column],
                                                prediction_labels, average='macro')
        avg_recall = metrics.recall_score(reference_data[target_column],
                                          prediction_labels, average='macro')
        avg_f1 = metrics.f1_score(reference_data[target_column],
                                  prediction_labels, average='macro')

        self.wi = BaseWidgetInfo(
            title="Reference: Model Quality With Macro Average",
            type="counter",
            details="",
            alertStats=AlertStats(),
            alerts=[],
            alertsPosition="row",
            insights=[],
            size=2,
            params={
                "counters": [
                    {"value": str(round(accuracy_score, 3)), "label": "Accuracy"},
                    {"value": str(round(avg_precision, 3)), "label": "Precision"},
                    {"value": str(round(avg_recall, 3)), "label": "Recall"},
                    {"value": str(round(avg_f1, 3)), "label": "F1"},
                    {"value": str(round(roc_auc, 3)), "label": "ROC AUC"},
                    {"value": str(round(log_loss, 3)), "label": "LogLoss"},
                ]
            },
            additionalGraphs=[],
        )
    else:
        self.wi = None