def get_simulation_data(simulation_name, simulation_parameters,
                        test_set_size=4000, validation_set_size=3200):
    simulation_function = get_simulation_function(simulation_name)
    try:
        sequences, y = simulation_function(**simulation_parameters)
    except Exception as e:
        return
    if simulation_name == "simulate_heterodimer_grammar":
        motif_names = [simulation_parameters["motif1"],
                       simulation_parameters["motif2"]]
    elif simulation_name == "simulate_multi_motif_embedding":
        motif_names = simulation_parameters["motif_names"]
    else:
        motif_names = [simulation_parameters["motif_name"]]
    train_sequences, test_sequences, y_train, y_test = train_test_split(
        sequences, y, test_size=test_set_size)
    train_sequences, valid_sequences, y_train, y_valid = train_test_split(
        train_sequences, y_train, test_size=validation_set_size)
    X_train = one_hot_encode(train_sequences)
    X_valid = one_hot_encode(valid_sequences)
    X_test = one_hot_encode(test_sequences)
    return Data(X_train, X_valid, X_test, y_train, y_valid, y_test, motif_names)
def test_thresholded_scorers(): """Test scorers that take thresholds.""" X, y = make_blobs(random_state=0, centers=2) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) clf = LogisticRegression(random_state=0) clf.fit(X_train, y_train) score1 = SCORERS['roc_auc'](clf, X_test, y_test) score2 = roc_auc_score(y_test, clf.decision_function(X_test)) score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]) assert_almost_equal(score1, score2) assert_almost_equal(score1, score3) logscore = SCORERS['log_loss'](clf, X_test, y_test) logloss = log_loss(y_test, clf.predict_proba(X_test)) assert_almost_equal(-logscore, logloss) # same for an estimator without decision_function clf = DecisionTreeClassifier() clf.fit(X_train, y_train) score1 = SCORERS['roc_auc'](clf, X_test, y_test) score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]) assert_almost_equal(score1, score2) # Test that an exception is raised on more than two classes X, y = make_blobs(random_state=0, centers=3) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) clf.fit(X_train, y_train) assert_raises(ValueError, SCORERS['roc_auc'], clf, X_test, y_test)
def xgb_semi_supervised(trainX, trainY, X_unlabeled, Y_unlabeled):
    row_count = trainX.shape[0]
    trainX = np.hstack((trainX, np.array(word_dist_list).reshape(row_count, 1)))
    trainX = np.hstack((trainX, np.array(time_dist_list).reshape(row_count, 1)))
    row_count = X_unlabeled.shape[0]
    X_unlabeled = np.hstack((X_unlabeled, np.array(word_dist_list_unlabeled).reshape(row_count, 1)))
    X_unlabeled = np.hstack((X_unlabeled, np.array(time_dist_list_unlabeled).reshape(row_count, 1)))
    X_unlabeled, _, Y_unlabeled, _ = train_test_split(X_unlabeled, Y_unlabeled, test_size=0.85, random_state=20)
    x_train, x_test, y_train, y_test = train_test_split(trainX, trainY, test_size=0.25, random_state=20)
    # concatenate x_train, y_train with X_unlabeled and Y_unlabeled respectively
    x_ = np.concatenate((x_train, X_unlabeled), axis=0)
    x_ = sparse.csr_matrix(x_)
    y_ = np.concatenate((y_train, Y_unlabeled), axis=0)
    # y_ = sparse.csr_matrix(y_)
    # unlabeled_indices = np.arange(x_shape[0])[x_train.shape[0]:]
    label_prop_model = label_propagation.LabelSpreading(kernel='knn', alpha=1.0)
    label_prop_model.fit(x_.toarray(), y_)
    y_pred = label_prop_model.transduction_
    # y_ = label_prop_model.predict(x_)
    xgb_model(x_, x_test, y_pred, y_test)
def splitDataset(data, random_seed):
    '''
    Given a dataframe and a seed value, this function splits out the dataframe
    into a training set, a validation set, and a test set using the provided
    seed value for consistency. It uses a 60/20/20 split, but this could easily
    be parameterized and passed into the function. It returns a dictionary of
    dataframes with keys train, valid and test.
    '''
    # Get column headers
    col_headers = list(data.columns.values)
    feature_cols = copy.deepcopy(col_headers)
    feature_cols.remove('Sample')
    feature_cols.remove('Diagnosis')
    class_col = ['Diagnosis']

    # Train/test/validate split
    train, test = train_test_split(data, test_size=0.2, random_state=random_seed)
    train = pd.DataFrame(train)
    test = pd.DataFrame(test)
    train.columns = col_headers
    test.columns = col_headers
    train, validate = train_test_split(train, test_size=0.25, random_state=random_seed)
    train = pd.DataFrame(train)
    validate = pd.DataFrame(validate)
    train.columns = col_headers
    validate.columns = col_headers

    # Separate features and classes
    all_data = {'train': train, 'valid': validate, 'test': test}
    return extractFeatures(all_data)
def processMethod3(userid, featureCondition=1, classificationCondition=1, offsetFeatureOn=False): """ User-i Device-j hack in User-i Device-k Model: iphone6plus hack iphone5 Returns ------- float : error rate """ # rawDataiPhone6Plus = loadUserData(userid, 1, datatype=1) # moment data # rawDataiPhone5 = loadUserData(userid, 2, datatype=1) # moment data # trainingData = splitMomentDataByFeature(rawDataiPhone5, featureCondition=featureCondition) # trainingLabel = rawDataiPhone5[:, 4] # testData = splitMomentDataByFeature(rawDataiPhone6Plus, featureCondition=featureCondition) # testLabel = rawDataiPhone6Plus[:, 4] iPhone6Plus = 1 iPhone5 = 2 trainingData, trainingLabel = splitMomentDataByFeatureAndLabel(userid, iPhone5, featureCondition, classificationCondition, offsetFeatureOn=offsetFeatureOn) testData, testLabel = splitMomentDataByFeatureAndLabel(userid, iPhone6Plus, featureCondition, classificationCondition, offsetFeatureOn=offsetFeatureOn) # use same test size with method1 trainingDataIP5, testDataIP5, trainingLabelIP5, testLabelIP5 = train_test_split(trainingData, trainingLabel, test_size=my_test_size, random_state=my_random_state) trainingDataIP6, testDataIP6, trainingLabelIP6, testLabelIP6 = train_test_split( testData, testLabel, test_size=my_test_size, random_state=my_random_state) return classify(trainingDataIP5, trainingLabelIP5, testDataIP6, testLabelIP6, kernel=my_kernel, max_iter=my_max_iteration)
def dump_data_2_pickle(gsr_file, pickleFile): """ dump the txt gsr file data into picke :type gsr_file: string :param gsr_file: path to gsr file, default: gsr_article/gsr_spanish.txt :type pickleFile: string :param pickleFile: path to pickle file, default: ../data/dataset.pkl """ # generate docs and gsrs docs, gsrs = generate_docs(gsr_file) # shuffle the data dataset = zip(docs, gsrs) train_set, test_set = train_test_split(dataset, test_size=0.3, random_state=10) valid_set, test_set = train_test_split(test_set, test_size=0.5, random_state=11) # construct the vocab list and transfer the data into word num word2id = {} # set UNKNOW word as UUKK word2id["UNK"] = 0 word2id["<S>"] = 1 word2id["</S>"] = 2 word2id["<PAD>"] = 3 pop2id = {} type2id = {} wid = 4 pid = 0 tid = 0 for doc, gsr in train_set: for sen in doc: for token in sen: if token not in word2id: word2id[token] = wid wid += 1 pop = gsr["population"] eType = gsr["eventType"] if pop not in pop2id: pop2id[pop] = pid pid += 1 if eType not in type2id: type2id[eType] = tid tid += 1 train_set = transform_set(train_set, word2id, pop2id, type2id) valid_set = transform_set(valid_set, word2id, pop2id, type2id) test_set = transform_set(test_set, word2id, pop2id, type2id) with open(pickleFile, 'w') as pf: cPickle.dump(train_set, pf) cPickle.dump(valid_set, pf) cPickle.dump(test_set, pf) cPickle.dump(word2id, pf) cPickle.dump(pop2id, pf) cPickle.dump(type2id, pf)
def tribunalTrain(data, predict, tribunal, split=.2, stat=False, statLis=None):
    # data for testing the tribunal performance, not in actual judge training
    dat_train, dat_test, lab_train, lab_test = train_test_split(data, predict, test_size=split)
    verdict = []
    print 'Tribunal in session'
    for judge in tribunal:
        jdat_train, jdat_test, jlab_train, jlab_test = train_test_split(dat_train, lab_train, test_size=split)
        judge.fit(jdat_train, jlab_train)
        print 'judge trained'
    for d in dat_test:
        votes = []
        for judge in tribunal:
            v = judge.predict(d)
            votes.append(v)
        decision = stats.mode(votes, axis=None)
        verdict.append(decision[0])
    npVerdict = np.array(verdict)
    if stat == False:
        svmDesc(npVerdict, lab_test, title='Tribunal Confusion Matrix')
    else:
        jac = jaccard_similarity_score(npVerdict, lab_test)
        statLis.append(jac)
def Adaboost(TrainData,TestData): features=['Time','Season','Hour','Minute','District'] clf = AdaBoostClassifier(tree.DecisionTreeClassifier(),n_estimators=30) size=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9] for i in range(0,len(size)): train,validation= train_test_split(TrainData, train_size=size[i]) while len(set(train['Category'])) != len(set(validation['Category'])): train,validation= train_test_split(TrainData, train_size=size[i]) clf = clf.fit(train[features], train['Category']) """stop = timeit.default_timer() print "Runnin time adaboost is ", stop-start""" predicted=np.array(clf.predict_proba(validation[features])) model=clf.predict(train[features]) model1=clf.predict(validation[features]) #scores = cross_val_score(clf, validation[features], validation['Category']) #print "Scores mean is",scores.mean() #accuracy print "Training accuracy is", accuracy_score(train['Category'].values.tolist(),model) print "Validation accuracy is",accuracy_score(validation['Category'].values.tolist(),model1) print "Precision is ",precision_score(validation['Category'].values.tolist(),model1,average='macro') print "Recall is ",recall_score(validation['Category'].values.tolist(),model1,average='macro') print "Log loss is", log_loss(validation['Category'].values.tolist(),predicted,eps=1e-15, normalize=True, sample_weight=None) #writing to file """Category_new=[]
def tuning_l2_penalty(out_file, featurizers=None):
    # featurizers for blog/blog, twitter+wiki/blog and twitter+wiki/twitter+wiki respectively
    if not featurizers:
        featurizers = [feat4, feat5, feat4]
    # used to weigh L-2 penalty
    c_vals = [v / 100.0 for v in range(50, 110, 10)]
    # data splits used
    b_train, b_test = train_test_split(blog_80, test_size=0.1, random_state=1)
    tw_train, tw_test = train_test_split(tw, test_size=0.1, random_state=1)
    # count sizes only once
    n_btest = float(len(b_test))
    n_b80 = float(len(blog_80))
    n_twtest = float(len(tw_test))
    for c_val in c_vals:
        print "Running l-2 tuning for C:%.2f" % c_val
        # Using split validation, as otherwise too slow
        make_model = lambda: Models.LogisticRegression(C=c_val)
        blog_errors = error_analyze(make_model, b_train, b_test, featurizers[0])
        twb_errors = error_analyze(make_model, tw, blog_80, featurizers[1])
        tw_errors = error_analyze(make_model, tw_train, tw_test, featurizers[2])
        blog_acc = 1 - len(blog_errors["error_indices"]) / n_btest
        twb_acc = 1 - len(twb_errors['error_indices']) / n_b80
        tw_acc = 1 - len(tw_errors['error_indices']) / n_twtest
        # write to file provided
        out_file.write("C=%f\n" % c_val)
        out_file.write("b=%f, twb=%f, tw=%f\n\n" % (blog_acc, twb_acc, tw_acc))
def split_train_test_with_common_vocabulary(sparse_data: dict, test_size: float): # seed = random.randint(0, 2 ** 32) # TODO: Enable seed = 1 train = {"unigrams": sparse_data["unigrams"], "counts": {}} test = {"unigrams": sparse_data["unigrams"], "counts": {}} coordinates_train, coordinates_test = cross_validation.train_test_split(sparse_data["coordinates"], test_size=test_size, random_state=seed) train["coordinates"] = coordinates_train test["coordinates"] = coordinates_test features = (feature for feature in sparse_data.keys() if feature not in ["coordinates", "counts", "unigrams"]) for feature in features: sparse_train, sparse_test = cross_validation.train_test_split(sparse_data[feature], test_size=test_size, random_state=seed) train[feature] = sparse_train test[feature] = sparse_test # [0] is because this is a matrix, so we get list of lists train["counts"][feature] = np.asarray(sparse_train.sum(axis=0)).flatten().tolist() test["counts"][feature] = np.asarray(sparse_test.sum(axis=0)).flatten().tolist() return train, test
def test_train_test_split():
    X = np.arange(100).reshape((10, 10))
    X_s = coo_matrix(X)
    y = np.arange(10)

    # simple test
    split = cval.train_test_split(X, y, test_size=None, train_size=.5)
    X_train, X_test, y_train, y_test = split
    assert_equal(len(y_test), len(y_train))
    # test correspondence of X and y
    assert_array_equal(X_train[:, 0], y_train * 10)
    assert_array_equal(X_test[:, 0], y_test * 10)

    # conversion of lists to arrays (deprecated?)
    split = cval.train_test_split(X, X_s, y.tolist(), allow_lists=False)
    X_train, X_test, X_s_train, X_s_test, y_train, y_test = split
    assert_array_equal(X_train, X_s_train.toarray())
    assert_array_equal(X_test, X_s_test.toarray())

    # don't convert lists to anything else by default
    split = cval.train_test_split(X, X_s, y.tolist())
    X_train, X_test, X_s_train, X_s_test, y_train, y_test = split
    assert_true(isinstance(y_train, list))
    assert_true(isinstance(y_test, list))

    # allow nd-arrays
    X_4d = np.arange(10 * 5 * 3 * 2).reshape(10, 5, 3, 2)
    y_3d = np.arange(10 * 7 * 11).reshape(10, 7, 11)
    split = cval.train_test_split(X_4d, y_3d)
    assert_equal(split[0].shape, (7, 5, 3, 2))
    assert_equal(split[1].shape, (3, 5, 3, 2))
    assert_equal(split[2].shape, (7, 7, 11))
    assert_equal(split[3].shape, (3, 7, 11))
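# Note: the test above exercises the legacy sklearn.cross_validation module
# (imported as `cval`), which was deprecated in scikit-learn 0.18 and removed
# in 0.20 in favour of sklearn.model_selection. A minimal sketch of the modern
# equivalent call, assuming only scikit-learn >= 0.18 and NumPy:
import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(100).reshape((10, 10))
y = np.arange(10)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print(X_train.shape, X_test.shape)  # (7, 10) (3, 10)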
def load_data():
    '''
    Loads the data, turns it into a word2vec representation, and splits it into
    training, validation, and testing sets with ratio 8:1:1
    '''
    trainingDataFile = '../data/traindata.txt'
    trainingPosFile = '../data/pos_Embedding.txt'
    trainingLabelFile = '../data/trainlabel.txt'
    wordToVecDictFile = '../data/glove/glove.6B.50d.txt'

    print('Vectorizing the features and labels...')
    start_time = timeit.default_timer()
    X, Y = word2vec.createVecFeatsLabels(trainingDataFile, trainingPosFile,
                                         trainingLabelFile, wordToVecDictFile,
                                         window_size)
    end_time = timeit.default_timer()

    print('Pickling the vectorization files')
    # pickling X-file
    clean_data = open('../data/clean_data.pkl', 'wb')
    pickle.dump(X, clean_data)
    clean_data.close()
    # pickling the labels-file
    clean_label = open('../data/clean_label.pkl', 'wb')
    pickle.dump(Y, clean_label)
    clean_label.close()

    print(('The vectorization ran for %.2fm' % ((end_time - start_time) / 60.)))
    print('Splitting into training, validation, and testing sets ...')
    X_train, X_rest, y_train, y_rest = train_test_split(X, Y, test_size=0.2, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_rest, y_rest, test_size=0.5, random_state=42)
    return X_train, X_val, X_test, y_train, y_val, y_test
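# The two-stage split above realises the 8:1:1 ratio from the docstring: the
# first call holds out 20% of the data, and the second call splits that 20% in
# half, giving 80% train / 10% validation / 10% test. A quick, self-contained
# check of the arithmetic (illustrative only, using the modern
# sklearn.model_selection import path):
import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(1000).reshape(-1, 1)
y = np.arange(1000)
X_train, X_rest, y_train, y_rest = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_rest, y_rest, test_size=0.5, random_state=42)
print(len(X_train), len(X_val), len(X_test))  # 800 100 100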
def load_dataset(path_id="", folder="", use_float_32=False, test_ratio=0.3, valid_ratio=0.1): #def load_dataset(path_id="", use_float_32=False, test_ratio=0.2, valid_ratio=0.1): # reading full dataset features_path = "data/%s/features%s.npy"%(folder, path_id) labels_path = "data/%s/labels%s.npy"%(folder, path_id) features = np.load(features_path) if use_float_32: features = features.astype(np.float32) labels = np.load(labels_path) # splitting data train_set_x, test_set_x, train_set_y, test_set_y = train_test_split(features, labels, test_size=test_ratio, random_state=89677) #train_set_x = features[:2500] #train_set_y = labels[:2500] #test_set_x = features[2500:] #test_set_y = labels[2500:] test_set_x = theano.shared(value=test_set_x, name='test_set_x', borrow=True) test_set_y = theano.shared(value=np.array(test_set_y), name='test_set_y', borrow=True) # split train set into validation set train_set_x, valid_set_x, train_set_y, valid_set_y = train_test_split(train_set_x, train_set_y, test_size=valid_ratio, random_state=89677) print train_set_x.shape, valid_set_x.shape, test_set_x.get_value(borrow=True).shape train_set_x = theano.shared(value=train_set_x, name='train_set_x', borrow=True) train_set_y = theano.shared(value=np.array(train_set_y), name='train_set_y', borrow=True) valid_set_x = theano.shared(value=valid_set_x, name='valid_set_x', borrow=True) valid_set_y = theano.shared(value=np.array(valid_set_y), name='valid_set_y', borrow=True) return ((train_set_x, train_set_y), (valid_set_x, valid_set_y), (test_set_x, test_set_y))
def load_data_sets(input_data, labels, split_only=True, valid_set=False):

    class DataSets(object):
        pass

    data_sets = DataSets()
    print("\nSplitting to Train & Test sets for Finetuning")
    if valid_set:
        train_examples, test_examples, train_labels, test_labels = \
            train_test_split(input_data, labels, test_size=0.2)
        train_examples, validation_examples, train_labels, validation_labels = \
            train_test_split(train_examples, train_labels, test_size=0.05)
        data_sets.validation = DataSet(validation_examples, validation_labels)
    else:
        train_examples, test_examples, train_labels, test_labels = \
            train_test_split(input_data, labels, test_size=0.3)
        data_sets.validation = None

    # validation_examples = input_data[:VALIDATION_SIZE]
    # train_examples = input_data[VALIDATION_SIZE:]

    data_sets.train = DataSet(train_examples, train_labels)
    data_sets.test = DataSet(test_examples, test_labels)
    if not split_only:
        data_sets.all = DataSet(input_data, labels)
    return data_sets
def get_best_k_model(model, max_k, x, y): # Fit a model using a range of best-k values, # returning the model that produces the best test score # Input # model: scikit-learn model # max_k: maximum k-value to iterate to (inclusive) # x: independent variables # y: dependent variable # Output # best_k: Number of dependent variables using to produce output # train_score: training score # test_score: test score # train_mse: training mse # test_mse: test mse test_scores = [] k_vals = [] k_limit = min(max_k, len(x.columns)) for k_val in range(1, k_limit + 1): best_x = fs.SelectKBest(fs.chi2, k = k_val).fit_transform(x, y) x_train, x_test, y_train, y_test = cv.train_test_split(best_x, y, test_size = 0.2, random_state = 0) test_scores.append(model.fit(x_train, y_train).score(x_test, y_test)) k_vals.append(k_val) best_k = k_vals[np.argmax(test_scores)] best_x = fs.SelectKBest(fs.chi2, k = best_k).fit_transform(x, y) x_train, x_test, y_train, y_test = cv.train_test_split(best_x, y, test_size = 0.2, random_state = 0) train_score, test_score, train_mse, test_mse = get_model_values(model, x_train, y_train, x_test, y_test) return best_k, train_score, test_score, train_mse, test_mse
def _parallel_eval(Classifier, params, X, y, w, n_repeat=5, verbose=1): if verbose > 0: print "[Start]", params thresholds, scores = [], [] for i in range(n_repeat): if verbose > 0: print "Fold", i _, X_fold, _, y_fold, _, w_fold = train_test_split(X, y, w, train_size=0.5, random_state=i) X_pred = load_predictions("stack/*-fold%d.npy" % i) X_fold = np.hstack((X_fold, X_pred)) X_train, X_valid, y_train, y_valid, w_train, w_valid = train_test_split(X_fold, y_fold, w_fold, train_size=0.33, random_state=i) X_train = np.asfortranarray(X_train, dtype=np.float32) w_train = rescale(w_train) w_train = rebalance(y_train, w_train) clf = Classifier(**params) try: clf = clf.fit(X_train, y_train, sample_weight=w_train) except: clf = clf.fit(X_train, y_train) threshold, score, _ = find_threshold(clf, X_valid, y_valid, w_valid) thresholds.append(threshold) scores.append(score) if verbose > 0: print "[End]", params, np.mean(thresholds), np.mean(scores) return (np.mean(scores), np.mean(thresholds), params, thresholds, scores)
def cook():
    x, y, weights = load_data()

    n_components = 200
    svd = TruncatedSVD(n_components, random_state=42)
    x_unweighted = svd.fit_transform(x)
    x_weighted = svd.fit_transform(weighted(x, weights))

    for i in range(9):
        frac = 1 - (i * 0.01 + 0.01)
        print frac

        x_train, x_test, y_train, y_test = train_test_split(x_unweighted, y, test_size=frac)
        classifier = AdaBoostClassifier(n_estimators=100)
        classifier.fit(x_train, y_train)
        print "Unweighted: ", classifier.score(x_test, y_test)

        x_train, x_test, y_train, y_test = train_test_split(x_weighted, y, test_size=frac)
        classifier = AdaBoostClassifier(n_estimators=100)
        classifier.fit(x_train, y_train)
        print "Weighted: ", classifier.score(x_test, y_test)

        print '--------------------------'
'''
def split(data, size):
    grouped = data.groupby('LOG BB')
    bbb_neg = grouped.get_group(0.0)
    bbb_pos = grouped.get_group(1.0)

    descriptor_n = bbb_neg.shape[1]
    # descriptor_n = 2756
    # descriptor_n = 30
    n = bbb_neg.shape[0]
    # n = 850
    # n = 0

    x_pos = bbb_pos.iloc[:n, 0:descriptor_n - 1].values
    y_pos = bbb_pos.iloc[:n, descriptor_n - 1:descriptor_n].values
    x_pos_train, x_pos_test, y_pos_train, y_pos_test = train_test_split(
        x_pos, y_pos, test_size=size, random_state=100)

    x_neg = bbb_neg.iloc[:, 0:descriptor_n - 1].values
    y_neg = bbb_neg.iloc[:, descriptor_n - 1:descriptor_n].values
    x_neg_train, x_neg_test, y_neg_train, y_neg_test = train_test_split(
        x_neg, y_neg, test_size=size, random_state=100)

    x_train = np.append(x_pos_train, x_neg_train, axis=0)
    y_train = np.append(y_pos_train, y_neg_train, axis=0)
    x_test = np.append(x_pos_test, x_neg_test, axis=0)
    y_test = np.append(y_pos_test, y_neg_test, axis=0)

    return x_train, x_test, y_train, y_test
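# The per-class split above keeps the positive/negative ratio identical in the
# train and test portions. The `stratify` argument of train_test_split (in
# sklearn.model_selection on newer releases) gives the same guarantee in a
# single call — a small sketch under those assumptions:
import numpy as np
from sklearn.model_selection import train_test_split

X = np.random.rand(100, 5)
y = np.array([0] * 80 + [1] * 20)  # imbalanced labels
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=100, stratify=y)
print(y_tr.mean(), y_te.mean())  # both close to 0.2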
def get_splitted_data_for_Cyst():
    # Load the raw data
    (all_images, image_class) = loadImages_for_Cyst()

    # test / train split
    #X_, X_test, y_, Y_test = train_test_split(all_images, image_class, test_size=0.20, random_state=42)
    #X_train, X_val, y_train, y_val = train_test_split(X_, y_, test_size=0.20, random_state=42)

    # Divide the data into a train and test set.
    X_train, X_test, T_train, T_test = cross_validation.train_test_split(all_images, image_class, test_size=0.2)
    # Divide the test set into a validation set and final test set.
    X_validation, X_test, T_validation, T_test = cross_validation.train_test_split(X_test, T_test, test_size=0.2)

    #print("Total: ", len(all_images), "Train", str(len(X_train)), ", Val: ", len(X_val), ",Test: ", len(X_test))

    # Normalize the data: subtract the mean image
    mean_image = np.mean(X_train, axis=0)
    X_train -= mean_image
    X_validation -= mean_image
    X_test -= mean_image

    return X_train, T_train, X_validation, T_validation, X_test, T_test
def conv_demo(): # load the digits dataset digits = load_digits() X = digits['data'] y_labels = digits['target'] lb = LabelBinarizer() y = lb.fit_transform(y_labels) # split into training, validation and test datasets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=RANDOM_STATE) X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.25, random_state=RANDOM_STATE) # train the neural net print("Building neural net to classify digits") conv_net = pynn.ConvNet(digits['images'][0].shape, 1, y.shape[1], random_state=RANDOM_STATE) print("Training") conv_net.fit(X_train, y_train, X_valid, y_valid, batch_size=20, n_epochs=20, learning_rate=0.05) y_pred = conv_net.predict(X_test) print("digits accuracy: {}%".format( accuracy_score(y_test.argmax(1), y_pred.argmax(1)) * 100))
def __init__(self, data, batch_range=None, init_epoch=1, init_batchnum=None, dp_params={}, test=False, fraction_test=0.01): if batch_range == None: raise Exception('the range is empty') if init_batchnum is None or init_batchnum not in batch_range: init_batchnum = batch_range[0] self.data_dir = None self.batch_range = batch_range self.curr_epoch = init_epoch self.curr_batchnum = init_batchnum self.dp_params = dp_params self.batch_meta = None self.data_dic = None self.test = test self.batch_idx = batch_range.index(init_batchnum) self.X = data[0] self.y = data[1] self.fraction_test = fraction_test if self.y is not None: print 'data is: {}, X shape {}, y shape {}'.format(len(data), self.X.shape,self.y.shape) self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=self.fraction_test, random_state=42) else: print 'data is: {}, X shape {}'.format(len(data), self.X.shape) self.X_train, self.X_test = train_test_split(self.X, test_size=self.fraction_test, random_state=42) self.y_train = np.array([0] * self.X_train.shape[0],dtype=np.float32) self.y_test = np.array([0] * self.X_test.shape[0],dtype=np.float32)
def main(): X, Y, encoder, scale = load_train_data('train.csv') estimators = 500 X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=0.2, random_state=0) X_train_real, X_test_real, Y_train_real, Y_test_real = train_test_split(X_train, Y_train, test_size=0.2, random_state=42) log.info('Loaded training file') X_test, _ = load_csv_file('test.csv', cut_end=False) log.info('Loaded test file') #Classifier Setup tree_clf = ExtraTreesClassifier(n_estimators=estimators, n_jobs=-1, random_state=42, max_depth=55, min_samples_split=1) clf = make_pipeline(TfidfTransformer(), DenseTransformer(), tree_clf) log.info('Fitting GradientBoost') clf.fit(X_train_real, Y_train_real) clf_probs = clf.predict_proba(X_test_real) score = log_loss(Y_test_real, clf_probs) log.info('Log Loss score un-trained = %f' % score) # Calibrate Classifier using ground truth in X,Y_valid sig_clf = CalibratedClassifierCV(clf, method="isotonic", cv="prefit") log.info('Fitting CalibratedClassifierCV') sig_clf.fit(X_valid, Y_valid) sig_clf_probs = sig_clf.predict_proba(X_test_real) sig_score = log_loss(Y_test_real, sig_clf_probs) log.info('Log loss score trained = %f' % sig_score) # Ok lets predict the test data with our funky new classifier sig_submission_probs = sig_clf.predict_proba(X_test) write_out_submission(sig_submission_probs, 'submission.csv')
def getImages():
    digitsImagesNormalized = getImagesFromDir(digitsPath)
    lettersImagesNormalized = getImagesFromDir(lettersPath)

    digitsImagesNormalized = [skpre.scale(digitsImagesNormalized[0]), digitsImagesNormalized[1]]
    lettersImagesNormalized = [skpre.scale(lettersImagesNormalized[0]), lettersImagesNormalized[1]]

    allImages = []
    for i in digitsImagesNormalized[0]:
        allImages.append(i)
    for i in lettersImagesNormalized[0]:
        allImages.append(i)

    # Split into test and train sets.
    # Compute PCA - dimensionality reduction of the data. :)
    pca = computePCA(allImages)
    digitstransformedData = pca.transform(digitsImagesNormalized[0])
    letterstransformedData = pca.transform(lettersImagesNormalized[0])

    dtrainDataTF, dtestDataTF, dclassesTrainTF, dclassesTestTF = train_test_split(
        digitstransformedData, digitsImagesNormalized[1], train_size=0.65)
    ltrainDataTF, ltestDataTF, lclassesTrainTF, lclassesTestTF = train_test_split(
        letterstransformedData, lettersImagesNormalized[1], train_size=0.65)

    return [[dtrainDataTF, dclassesTrainTF], [dtestDataTF, dclassesTestTF]], \
           [[ltrainDataTF, lclassesTrainTF], [ltestDataTF, lclassesTestTF]]
def split_dataset(index, random_state, test_ratio=0.2, valid_ratio=0.2):
    index = list(index)
    ix_train, ix_test = train_test_split(index, test_size=test_ratio, random_state=random_state)
    ix_train, ix_valid = train_test_split(ix_train, test_size=valid_ratio / (1 - test_ratio),
                                          random_state=random_state)
    return {'train': ix_train, 'valid': ix_valid, 'test': ix_test}
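# The second split uses valid_ratio / (1 - test_ratio) so the validation set is
# a fraction of the *original* index rather than of the remaining train part:
# with the defaults, 0.2 / 0.8 = 0.25 of the 80% remainder, i.e. 20% overall.
# A quick numeric check (illustrative only, modern sklearn.model_selection API):
from sklearn.model_selection import train_test_split

index = list(range(1000))
ix_train, ix_test = train_test_split(index, test_size=0.2, random_state=0)
ix_train, ix_valid = train_test_split(ix_train, test_size=0.2 / (1 - 0.2), random_state=0)
print(len(ix_train), len(ix_valid), len(ix_test))  # 600 200 200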
def create_sets(img_dir, train_set_proportion=.6, test_set_proportion=.2, val_set_proportion=.2):
    '''Split a list of image files up into training, testing and validation sets.'''
    if os.path.isfile(img_dir + 'imgs.list'):
        baseimgfilenames = pickle.load(open(img_dir + 'imgs.list', 'rb'))
    else:
        imgfilenames = glob.glob(img_dir + '*.jpg')
        baseimgfilenames = [os.path.basename(f) for f in imgfilenames]

    train, val = train_test_split(np.arange(len(baseimgfilenames)),
                                  train_size=train_set_proportion + test_set_proportion,
                                  test_size=val_set_proportion,
                                  random_state=1)

    train_test_prop = train_set_proportion + test_set_proportion
    train, test = train_test_split(train,
                                   train_size=train_set_proportion / train_test_prop,
                                   test_size=test_set_proportion / train_test_prop,
                                   random_state=1)

    trainfiles = [baseimgfilenames[i] for i in train]
    valfiles = [baseimgfilenames[i] for i in val]
    testfiles = [baseimgfilenames[i] for i in test]

    return trainfiles, valfiles, testfiles
def main(unused_argv): iris = datasets.load_iris() x_train, x_test, y_train, y_test = train_test_split( iris.data, iris.target, test_size=0.2, random_state=42) x_train, x_val, y_train, y_val = train_test_split( x_train, y_train, test_size=0.2, random_state=42) val_monitor = learn.monitors.ValidationMonitor( x_val, y_val, early_stopping_rounds=200) # classifier with early stopping on training data classifier1 = learn.DNNClassifier( hidden_units=[10, 20, 10], n_classes=3, model_dir='/tmp/iris_model/') classifier1.fit(x=x_train, y=y_train, steps=2000) score1 = metrics.accuracy_score(y_test, classifier1.predict(x_test)) # classifier with early stopping on validation data, save frequently for # monitor to pick up new checkpoints. classifier2 = learn.DNNClassifier( hidden_units=[10, 20, 10], n_classes=3, model_dir='/tmp/iris_model_val/', config=tf.contrib.learn.RunConfig(save_checkpoints_secs=1)) classifier2.fit(x=x_train, y=y_train, steps=2000, monitors=[val_monitor]) score2 = metrics.accuracy_score(y_test, classifier2.predict(x_test)) # In many applications, the score is improved by using early stopping print('score1: ', score1) print('score2: ', score2) print('score2 > score1: ', score2 > score1)
def create_sets(img_dir, train_set_proportion=.6, test_set_proportion=.2, val_set_proportion=.2): '''Split a list of image files up into training, testing and validation sets.''' imgfilenames = glob.glob(img_dir + '*.jpg') baseimgfilenames = [os.path.basename(f) for f in imgfilenames] if train_set_proportion + test_set_proportion < 1: train,val = train_test_split(np.arange(len(baseimgfilenames)), train_size=train_set_proportion+test_set_proportion, test_size=val_set_proportion, random_state=1) else: train = np.arange(len(baseimgfilenames)) val = [] train_test_prop = train_set_proportion + test_set_proportion train,test = train_test_split(train, train_size=train_set_proportion/train_test_prop, test_size=test_set_proportion/train_test_prop, random_state=1) trainfiles = [baseimgfilenames[i] for i in train] testfiles = [baseimgfilenames[i] for i in test] valfiles = [baseimgfilenames[i] for i in val] return trainfiles, valfiles,testfiles
def train_lsvr(): train_sys = np.load('fc2_train_sys.npy') test_sys = np.load('fc2_test_sys.npy') # from sklearn.preprocessing import StandardScaler # sle = StandardScaler() # train_sys = sle.fit_transform(train_sys) # test_sys = sle.fit_transform(test_sys) y = np.load('data/y_train.npy') from sklearn import svm #from sklearn.metrics import mean_squared_error from sklearn.ensemble import RandomForestRegressor lsvr = svm.SVR(C=0.1) # 0.045 #lsvr = RandomForestRegressor(n_estimators = 100) train_sys, val_sys, train_y_sys, val_y_sys = train_test_split(train_sys, y[:,0]) lsvr.fit(train_sys, train_y_sys) #print mean_squared_error(val_y_sys, l pred_systole = lsvr.predict(val_sys) cdf_val = real_to_cdf(val_y_sys) cdf_pred_systole = real_to_cdf(pred_systole) crps_val = crps(cdf_val, cdf_pred_systole) print('CRPS(val sys) = {0}'.format(crps_val)) train_dia = np.load('fc2_train_dia.npy') test_dia = np.load('fc2_test_dia.npy') train_dia, val_dia, train_y_dia, val_y_dia = train_test_split(train_dia, y[:,1]) lsvr.fit(train_dia, train_y_dia) pred_dia = lsvr.predict(val_dia) cdf_val_dia = real_to_cdf(val_y_dia) cdf_pred_dia = real_to_cdf(pred_dia) crps_val = crps(cdf_val_dia, cdf_pred_dia) print('CRPS(val dia) = {0}'.format(crps_val))
def split_data(x_train, y_train):
    """
    Given training data cropped from the original dataset by create_training_set.py,
    split this data up into training, cross-validation, and test data.

    INPUTS:
    x_train = Features cropped from original dataset
    y_train = Labels manually input from x_train

    OUTPUTS:
    new_x_train = New training data randomly selected from x_train
    new_x_crossval = Cross-validation samples from x_train
    new_x_test = Test samples from x_train
    new_y_train = Training labels
    new_y_crossval = Cross-validation labels
    new_y_test = Testing labels
    """
    new_x_train, new_x_test, new_y_train, new_y_test \
        = cross_val.train_test_split(x_train, y_train, test_size=0.3, random_state=53)
    new_x_crossval, new_x_test, new_y_crossval, new_y_test \
        = cross_val.train_test_split(new_x_test, new_y_test, test_size=0.5, random_state=41)

    return new_x_train, new_x_crossval, new_x_test, new_y_train, \
        new_y_crossval, new_y_test
def iris_demo():
    # load the iris dataset
    iris = load_iris()
    X = iris['data']
    y_labels = iris['target']

    lb = LabelBinarizer()
    y = lb.fit_transform(y_labels)

    # split into training, validation and test datasets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        random_state=RANDOM_STATE)
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.25,
                                                          random_state=RANDOM_STATE)

    # train the neural net
    print("Building logistic regression classifier to classify iris data")
    nn = pynn.ArtificialNeuralNet([X_train.shape[1], 20, y_train.shape[1]])
    print("Training")
    nn.fit(X_train, y_train, X_valid, y_valid,
           batch_size=20, n_epochs=20, learning_rate=0.05,
           random_state=RANDOM_STATE)

    y_pred = nn.predict(X_test)
    print("iris accuracy: {}%".format(
        accuracy_score(y_test.argmax(1), y_pred.argmax(1)) * 100))
x_dataset = []
Y = []
for filename in gb.glob('datasets/imagenes/*.ppm'):
    img = misc.imread(filename)
    x_dataset.append(img)
    Y.append(filename)

## 2) preprocess
X = []
for img in x_dataset:
    # 2.1. Convert to grayscale
    gray_img = color.rgb2gray(img)
    # 2.2. Equalize the image
    eq = exposure.equalize_hist(gray_img)
    # 2.3. Apply some filter
    blur = gaussian(eq, sigma=1)
    # binarize the image
    bin_img = (blur > blur.mean()).astype(int)
    plot_image(blur)
    plot_image(bin_img)
    # 2.4. Flatten the image
    X.append(np.reshape(bin_img, [-1]))

ipdb.set_trace()

## 3) Split into training and test sets
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.2, random_state=42)
@author: hp
"""

import numpy as np
from sklearn import preprocessing, cross_validation, neighbors, svm
import pandas as pd

df = pd.read_csv('breast-cancer-wisconsin.txt')
df.replace('?', -99999, inplace=True)
df.drop(['id'], 1, inplace=True)
#print( df.head() )

X = np.array(df.drop(['class'], 1))
y = np.array(df['class'])

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)

clf = svm.SVC()  # SVC does not accept an n_jobs argument
clf.fit(X_train, y_train)

accuracy = clf.score(X_test, y_test)
#print( accuracy )

example_measures = np.array([2, 7, 10, 10, 7, 10, 4, 9, 4])
example_measures = example_measures.reshape(1, -1)  # one sample with 9 features
prediction = clf.predict(example_measures)
print(prediction)
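# The import list above also pulls in `neighbors`, which goes unused; a k-NN
# classifier is a drop-in alternative on the same split. A sketch, assuming the
# X_train / X_test / y_train / y_test arrays from the snippet above and the
# standard scikit-learn API:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)  # unlike SVC, k-NN does accept n_jobs
knn.fit(X_train, y_train)
print(knn.score(X_test, y_test))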
depen = dataset.iloc[:, 3].values

"""
#Taking Care Of Missing Data
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)  # Grab the null values
imputer = imputer.fit(inpen[:, 1:3])  # Get the mean of all the other values in each of these columns
inpen[:, 1:3] = imputer.transform(inpen[:, 1:3])  # Set the null vals to the calculated mean

#Encoding Categorical Data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
encoder_country = LabelEncoder()
inpen[:, 0] = encoder_country.fit_transform(inpen[:, 0])
onehotencoder = OneHotEncoder(categorical_features=[0])
inpen = onehotencoder.fit_transform(inpen).toarray()
encoder_purchased = LabelEncoder()
depen = encoder_purchased.fit_transform(depen)
"""

#Splitting The Dataset Into The Training And Testing Set
from sklearn.cross_validation import train_test_split
inpen_train, inpen_test, depen_train, depen_test = train_test_split(
    inpen, depen, test_size=0.2)

"""
#Feature Scaling
from sklearn.preprocessing import StandardScaler
stdScale_inpen = StandardScaler()
inpen_train = stdScale_inpen.fit_transform(inpen_train)  # We need to fit the training set before we transform it
inpen_test = stdScale_inpen.transform(inpen_test)  # We do not need to fit the test set because the scaler is already fitted to the training set
"""
                    learning_rate=0.01, n_estimators=550,
                    subsample=0.5, colsample_bytree=0.5, seed=0)
clf.fit(train_x, train_y)
test_y = clf.predict_proba(test_x)[:, 1]
makesubmission(test_y)
print 'done'

# sn = model2.degit_network(units=args.units, gpu=args.gpu)
sn = model.shoot_network(units=args.units, gpu=args.gpu)

if args.train > 0:
    print 'predict validation test set'
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        train_x, train_y, test_size=0.2, random_state=0)

    clf = svm.SVC()
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)  # predict, not fit, on the held-out features
    print 'svm logloss', sn.logloss(y_test, pred)

    sn.fit(X_train, y_train, n_epoch=args.epoch, batchsize=args.batchsize, save=False)
    pred = sn.predict(X_test)
    print 'logloss of test set:', sn.logloss(y_test, pred)
glass_data.loc[glass_data.Type.between(1, 4), 'binary'] = 0
glass_data.loc[glass_data.Type.between(5, 7), 'binary'] = 1
print glass_data.head()
#print glass_data[(glass_data.Type > 2) & (glass_data.Type < 7)]

#part 2
X = glass_data[['Ref Index', 'Sodium', 'Mag', 'Alum', 'Silicon',
                'Potas', 'Calcium', 'Barium', 'Iron', 'Type']]
y = glass_data.binary
print X.shape
print y.shape
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

#part 3
#fit the model
LR = LogisticRegression()
LR.fit(X_train, y_train)
B1 = LR.coef_[0][0]
B0 = LR.intercept_[0]
print B1, "B1", B0, "B0"
print np.exp(B1), "significant? Yes, I believe so."
prob = LR.score(X_test, y_test)
print prob, "model accuracy score"

#make predictions
preds = LR.predict(X_test)
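# Editorial note: the feature matrix above includes the raw 'Type' column even
# though the 'binary' target is derived directly from it, so the model can read
# the label straight off one feature. If the intent is to predict the binary
# class from the chemistry alone, dropping 'Type' avoids that leakage — a
# one-line sketch, assuming the same glass_data frame:
X = glass_data[['Ref Index', 'Sodium', 'Mag', 'Alum', 'Silicon',
                'Potas', 'Calcium', 'Barium', 'Iron']]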
#to give column names dataset.columns = [] #Diving the dataset into independent and dependent variables X = dataset.iloc[:, :].values Y = dataset.iloc[:, :].values # Column names list(dataset) #Splitting the data into training set and validation set from sklearn.cross_validation import train_test_split train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.2, random_state=0) #Correlation Matrix corr_matrix = train_X.corr() f, ax = plt.subplots(figsize=(16, 10)) ax = sns.heatmap(corr_matrix) ax.set_title("correlation between all features") figure = ax.get_figure() #Do feature scaling if required. #Convert into categorical variable if required #Model from sklearn.linear_model import LinearRegression
# In[17]: from sklearn.cross_validation import cross_val_score # In[18]: from sklearn.datasets import load_iris from sklearn.cross_validation import train_test_split from sklearn.neighbors import KNeighborsClassifier from sklearn import metrics # In[22]: # use train/test split with different random_state values X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4) # check classification accuracy of KNN with K=5 knn = KNeighborsClassifier(n_neighbors=5) knn.fit(X_train, y_train) y_pred = knn.predict(X_test) print(metrics.accuracy_score(y_test, y_pred)) # In[23]: # simulate splitting a dataset of 25 observations into 5 folds from sklearn.cross_validation import KFold kf = KFold(25, n_folds=5, shuffle=False) # print the contents of each training and testing set print('{} {:^61} {}'.format('Iteration', 'Training set observations',
def load_data(dataset, nframes=13, features='MFCC', scaling='normalize', pca_whiten=0, cv_frac=0.2, dataset_name='timit', speakers=False, numpy_array_only=False): """ params: - dataset: folder - nframes: number of frames to replicate/pad - features: 'MFCC' (13 + D + A = 39) || 'fbank' (40 coeffs filterbanks) || 'gamma' (50 coeffs gammatones) - scaling: 'none' || 'unit' (put all the data into [0-1]) || 'normalize' ((X-mean(X))/std(X)) || student ((X-mean(X))/std(X, deg_of_liberty=1)) - pca_whiten: not if 0, MLE if < 0, number of components if > 0 - cv_frac: cross validation fraction on the train set - dataset_name: prepended to the name of the serialized stuff - speakers: if true, Ys (labels) are speakers instead of phone's states """ params = { 'nframes_mfcc': nframes, 'features': features, 'scaling': scaling, 'pca_whiten_mfcc_path': 'pca_' + str(pca_whiten) + '.pickle' if pca_whiten else 0, 'cv_frac': cv_frac, 'theano_borrow?': BORROW, 'use_caching?': USE_CACHING, 'train_classifiers_1_frame?': TRAIN_CLASSIFIERS_1_FRAME, 'train_classifiers?': TRAIN_CLASSIFIERS, 'dataset_name': dataset_name, 'speakers?': speakers } with open('prep_' + dataset_name + '_params.json', 'w') as f: f.write(json.dumps(params)) suffix = scaling if speakers: suffix += "_spkr" def prep_and_serialize(): [train_x, train_y, test_x, test_y, dev_x, dev_y] = prep_data(dataset, nframes=nframes, features=features, scaling=scaling, pca_whiten=pca_whiten, dataset_name=dataset_name, speakers=speakers, dev=(cv_frac == 'fixed')) with open( prefix_path + 'train_x_' + dataset_name + '_' + features + str(nframes) + suffix + '.npy', 'wb') as f: np.save(f, train_x) with open( prefix_path + 'train_y_' + dataset_name + '_' + features + str(nframes) + suffix + '.npy', 'wb') as f: np.save(f, train_y) with open( prefix_path + 'test_x_' + dataset_name + '_' + features + str(nframes) + suffix + '.npy', 'wb') as f: np.save(f, test_x) with open( prefix_path + 'test_y_' + dataset_name + '_' + features + str(nframes) + suffix + '.npy', 'wb') as f: np.save(f, test_y) if dev_x != None: with open( prefix_path + 'dev_x_' + dataset_name + '_' + features + str(nframes) + suffix + '.npy', 'wb') as f: np.save(f, dev_x) if dev_y != None: with open( prefix_path + 'dev_y_' + dataset_name + '_' + features + str(nframes) + suffix + '.npy', 'wb') as f: np.save(f, dev_y) print ">>> Serialized all train/test tables" return [train_x, train_y, test_x, test_y, dev_x, dev_y] if USE_CACHING: try: # try to load from serialized filed, beware with open( prefix_path + 'train_x_' + dataset_name + '_' + features + str(nframes) + suffix + '.npy', 'rb') as f: train_x = np.load(f) with open( prefix_path + 'train_y_' + dataset_name + '_' + features + str(nframes) + suffix + '.npy', 'rb') as f: train_y = np.load(f) with open( prefix_path + 'test_x_' + dataset_name + '_' + features + str(nframes) + suffix + '.npy', 'rb') as f: test_x = np.load(f) with open( prefix_path + 'test_y_' + dataset_name + '_' + features + str(nframes) + suffix + '.npy', 'rb') as f: test_y = np.load(f) if cv_frac == 'fixed': with open( prefix_path + 'dev_x_' + dataset_name + '_' + features + str(nframes) + suffix + '.npy', 'rb') as f: dev_x = np.load(f) with open( prefix_path + 'dev_y_' + dataset_name + '_' + features + str(nframes) + suffix + '.npy', 'rb') as f: dev_y = np.load(f) except: # do the whole preparation (normalization / padding) print "doing the preparation because no serialized data found" [train_x, train_y, test_x, test_y, dev_x, dev_y] = prep_and_serialize() else: [train_x, train_y, 
test_x, test_y, dev_x, dev_y] = prep_and_serialize() if cv_frac == 'fixed': X_train = train_x y_train = train_y X_validate = dev_x y_validate = dev_y else: from sklearn import cross_validation X_train, X_validate, y_train, y_validate = cross_validation.train_test_split( train_x, train_y, test_size=cv_frac, random_state=0) if numpy_array_only: train_set_x = X_train train_set_y = np.asarray(y_train, dtype='int32') val_set_x = X_validate val_set_y = np.asarray(y_validate, dtype='int32') test_set_x = test_x test_set_y = np.asarray(test_y, dtype='int32') else: train_set_x = theano.shared(X_train, borrow=BORROW) train_set_y = theano.shared(np.asarray(y_train, dtype=theano.config.floatX), borrow=BORROW) train_set_y = T.cast(train_set_y, 'int32') val_set_x = theano.shared(X_validate, borrow=BORROW) val_set_y = theano.shared(np.asarray(y_validate, dtype=theano.config.floatX), borrow=BORROW) val_set_y = T.cast(val_set_y, 'int32') test_set_x = theano.shared(test_x, borrow=BORROW) test_set_y = theano.shared(np.asarray(test_y, dtype=theano.config.floatX), borrow=BORROW) test_set_y = T.cast(test_set_y, 'int32') return [(train_set_x, train_set_y), (val_set_x, val_set_y), (test_set_x, test_set_y)]
import matplotlib.pyplot as plt
import pandas as pd

#Importing the dataset
dataset = pd.read_csv('Salary_Data.csv')
print(dataset)
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 1].values
#print(X)
#print(Y)

#Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=1/3, random_state=0)
#print(X_train)
#print(X_test)
#print(Y_train)
#print(Y_test)

#Feature Scaling
#Most libraries will take care of this step

#Fitting Simple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, Y_train)
>>> y = df['quality']
>>> regressor = LinearRegression()
>>> scores = cross_val_score(regressor, X, y, cv=5)
>>> print scores.mean(), scores
0.290041628842 [ 0.13200871 0.31858135 0.34955348 0.369145 0.2809196 ]

#Stochastic Gradient Descent
>>> import numpy as np
>>> from sklearn.datasets import load_boston
>>> from sklearn.linear_model import SGDRegressor
>>> from sklearn.cross_validation import cross_val_score
>>> from sklearn.preprocessing import StandardScaler
>>> from sklearn.cross_validation import train_test_split
>>> data = load_boston()
>>> X_train, X_test, y_train, y_test = train_test_split(data.data, data.target)
>>> X_scaler = StandardScaler()
>>> y_scaler = StandardScaler()
>>> X_train = X_scaler.fit_transform(X_train)
>>> y_train = y_scaler.fit_transform(y_train)
>>> X_test = X_scaler.transform(X_test)
>>> y_test = y_scaler.transform(y_test)
>>> regressor = SGDRegressor(loss='squared_loss')
>>> scores = cross_val_score(regressor, X_train, y_train, cv=5)
>>> print 'Cross validation r-squared scores:', scores
>>> print 'Average cross validation r-squared score:', np.mean(scores)
>>> regressor.fit(X_train, y_train)
>>> print 'Test set r-squared score', regressor.score(X_test, y_test)
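# In the transcript above the scaler is fit on the whole training set before
# cross_val_score is called, so every CV fold sees scaling statistics computed
# partly from its own validation data. Wrapping the scaler and the regressor in
# a Pipeline keeps the scaling inside each fold — a sketch under the modern
# sklearn.pipeline / model_selection API, with a synthetic dataset standing in
# for the Boston data:
import numpy as np
from sklearn.datasets import make_regression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import cross_val_score

X, y = make_regression(n_samples=200, n_features=13, noise=10.0, random_state=0)
pipe = make_pipeline(StandardScaler(), SGDRegressor())  # scaler refit within each fold
print(cross_val_score(pipe, X, y, cv=5).mean())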
dt.fit(training_feature,training_target) print dt.score(training_feature, training_target) print dt.score(test_feature, test_target) print dt.best_estimator_ parameters=[{'n_estimators':values}] dt=grid_search.GridSearchCV(ensemble.ExtraTreesClassifier(),parameters,cv=5,scoring="accuracy",n_jobs=6) dt.fit(training_feature,training_target) print dt.score(training_feature, training_target) print dt.score(test_feature, test_target) print dt.best_estimator_ ''' num=0 while num<10: train,test=train_test_split(x,test_size=int(x.shape[0]*0.15)) feature= train[0:,1:] target= train[0:,0] feature_test=test[0:,1:] target_test= test[0:,0] random_forest(feature, target, feature_test,target_test) num=num+1
        count = count + 1
count = 0
for i in range(0, len(negdataset)):
    for j in range(i + 1, len(negdataset)):
        negTol[count] = np.abs(negdataset[i] - negdataset[j])
        count = count + 1

# shuffle and build the dataset
np.random.shuffle(posTol)
np.random.shuffle(negTol)
pos = np.array(posTol[0:poSize])
neg = np.array(negTol[0:neSize])  # draw the negatives from negTol, not posTol
dataset = np.concatenate((pos, neg), axis=0)

# split the dataset and label:
X_train, X_test, y_train, y_test = train_test_split(dataset, label, random_state=0)
# np.transpose(label)
print(mt.sqrt(dataset.shape[1]))

lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred_class = lr.predict(X_test)

# calculate accuracy
print("acc:", metrics.accuracy_score(y_test, y_pred_class))
# compute the null accuracy
print("null acc:", max(y_test.mean(), 1 - y_test.mean()))
# confusion matrix
print("confusion matrix:", metrics.confusion_matrix(y_test, y_pred_class))

#scores = cross_val_score(lr, dataset, label, cv=2, scoring='roc_auc')
#result = scores.mean()
#print(result)

elapsed = (time.clock() - start)
def trainencoder( sources = ("image_vects", "word_vects") , sources_k = ("image_vects_k", "word_vects_k") , batch_size=128 , embedding_dim=300 , n_captions=5 , n_sbu=None , separate_emb=False , test_size=1000 # per dataset , mode='dev' ): if mode=="coco120k+flickr38k": XYsplit_cum = ([], [], [], []) xyloaders = [ "cocoXYFilenames(dataType='train2014')" , "cocoXYFilenames(dataType='val2014')" , "flickrXYFilenames(dataType='8k')" , "flickrXYFilenames(dataType='30k')" ] ntrains = [80000, 40000, 8000, 30000] for xyloader, ntrain in zip(xyloaders, ntrains): X, Y, _ = eval(xyloader) XYsplit = train_test_split(X, Y, train_size=ntrain) for i in range(len(XYsplit)): XYsplit_cum[i].extend(XYsplit[i]) trX, teX, trY, teY = XYsplit_cum else: trX, teX, trY, teY = coco(mode=mode, n_captions=n_captions, test_size=test_size) if n_sbu: sbutrX, sbuteX, sbutrY, sbuteY = sbu(mode=mode, test_size=test_size) pairs = ( (trX, sbutrX) , (teX, sbuteX) , (trY, sbutrY) , (teY, sbuteY) ) for coco_data, sbu_data in pairs: if isinstance(coco_data, list): coco_data.extend(sbu_data) print("n_train: %d" % len(trX)) print("n_test: %d" % len(teX)) # # # # # # # # # # # # Modeling Building # # # # # # # # # # # # s = Encoder( image_feature_dim=4096 , embedding_dim=embedding_dim , biases_init=Constant(0.) , weights_init=Uniform(width=0.08) ) s.initialize() image_vects = tensor.matrix(sources[0]) # named to match the source name word_vects = tensor.tensor3(sources[1]) # named to match the source name image_vects_k = tensor.matrix(sources_k[0]) # named to match the contrastive source name word_vects_k = tensor.tensor3(sources_k[1]) # named to match the contrastive source name # image_vects.tag.test_value = np.zeros((2, 4096), dtype='float32') # word_vects.tag.test_value = np.zeros((2, 15, 50), dtype='float32') # image_vects_k.tag.test_value = np.zeros((2, 4096), dtype='float32') # word_vects_k.tag.test_value = np.zeros((2, 15, 50), dtype='float32') # learned image embedding, learned sentence embedding lim, ls = s.apply(image_vects, word_vects) # learned constrastive im embedding, learned contrastive s embedding lcim, lcs = s.apply(image_vects_k, word_vects_k) # identical cost code thanks to Ryan Kiros # https://github.com/youralien/skip-thoughts/blob/master/eval_rank.py lim = l2norm(lim) lcim = l2norm(lcim) ls = l2norm(ls) lcs = l2norm(lcs) margin = 0.2 # alpha term should not be more than 1 cost_im = margin - (lim * ls).sum(axis=1) + (lim * lcs).sum(axis=1) cost_im = cost_im * (cost_im > 0.) # this is like the max(0, pairwise-ranking-loss) cost_im = cost_im.sum(0) cost_s = margin - (ls * lim).sum(axis=1) + (ls * lcim).sum(axis=1) cost_s = cost_s * (cost_s > 0.) 
# this is like max(0, pairwise-ranking-loss) cost_s = cost_s.sum(0) cost = cost_im + cost_s cost.name = "pairwise_ranking_loss" # function(s) to produce embedding if separate_emb: img_encoder = theano.function([image_vects], lim) txt_encoder = theano.function([word_vects], ls) f_emb = theano.function([image_vects, word_vects], [lim, ls]) if n_sbu: sbuname = "sbu%d+" % n_sbu else: sbuname = '' name = "%sproject1.%s.jointembedder" % (sbuname, mode) savename = MODEL_FILES_DIR + name def save_function(self): if separate_emb: ModelIO.save( img_encoder , savename + "_Img") ModelIO.save( txt_encoder , savename + "_Txt") ModelIO.save(f_emb, savename) print "Similarity Embedding function(s) saved while training" def rank_function(stream): images, captions, _0, _1 = stream.get_epoch_iterator().next() image_embs, caption_embs = f_emb(images, captions) ModelEval.ImageSentenceRanking(image_embs, caption_embs) def rank_coco(self=None): # Get 1000 images / captions to test rank stream = DataETL.getFinalStream(teX, teY, sources=sources, sources_k=sources_k, batch_size=test_size, shuffle=True) print "COCO test" rank_function(stream) def rank_sbu(self=None): stream = DataETL.getFinalStream(sbuteX, sbuteY, sources=sources, sources_k=sources_k, batch_size=test_size, shuffle=True) print "SBU test" rank_function(stream) def rank_em(self=None): rank_coco() if n_sbu: rank_sbu() cg = ComputationGraph(cost) # # # # # # # # # # # # Modeling Training # # # # # # # # # # # # algorithm = GradientDescent( cost=cost , parameters=cg.parameters , step_rule=Adam(learning_rate=0.0002) ) main_loop = MainLoop( model=Model(cost) , data_stream=DataETL.getFinalStream(trX, trY, sources=sources, sources_k=sources_k, batch_size=batch_size) , algorithm=algorithm , extensions=[ DataStreamMonitoring( [cost] , DataETL.getFinalStream(trX, trY, sources=sources, sources_k=sources_k, batch_size=batch_size, shuffle=True) , prefix='train') , DataStreamMonitoring( [cost] , DataETL.getFinalStream(teX, teY, sources=sources, sources_k=sources_k, batch_size=batch_size, shuffle=True) , prefix='test') , UserFunc(save_function, after_epoch=True) , UserFunc(rank_em, after_epoch=True) , Printing() , LogToFile('logs/%s.csv' % name) ] ) main_loop.run()
import tensorflow as tf
from sklearn.datasets import load_digits
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelBinarizer

digits = load_digits()
X = digits.data
y = digits.target
y = LabelBinarizer().fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

input_size = 64
output_size = 10
norm = False
hidden_units1 = 100
hidden_units2 = 50


def fc_layer_without_bn(x_input, num_units, activation, is_training):
    layer = tf.layers.dense(x_input, num_units, use_bias=False)
    layer = activation(layer)
    return layer


def compute_accuracy(v_xs, v_ys):
    global prediction  # declare prediction as a global variable first
    y_pre = sess.run(prediction, feed_dict={
        xs: v_xs,
        on_train: False
    })  # predicted values (probabilities); with 10 classes each sample has 10 probability columns
import pickle
import numpy
numpy.random.seed(42)


### the words (features) and authors (labels), already largely processed
### these files should have been created from the previous (Lesson 10) mini-project.
words_file = "/Users/dadda/Dropbox (MIT)/Online Courses/Intro to ML/ud120-projects-master/text_learning/your_word_data.pkl"
authors_file = "/Users/dadda/Dropbox (MIT)/Online Courses/Intro to ML/ud120-projects-master/text_learning/your_email_authors.pkl"
word_data = pickle.load(open(words_file, "r"))
authors = pickle.load(open(authors_file, "r"))


### test_size is the percentage of events assigned to the test set (remainder go into training)
### feature matrices changed to dense representations for compatibility with classifier
### functions in versions 0.15.2 and earlier
from sklearn import cross_validation
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
    word_data, authors, test_size=0.1, random_state=42)

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
features_train = vectorizer.fit_transform(features_train)
features_test = vectorizer.transform(features_test).toarray()


### a classic way to overfit is to use a small number
### of data points and a large number of features
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train = labels_train[:150]


### your code goes here
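### One plausible continuation for the "your code goes here" marker above (an
### assumption, not the original author's code): fit a simple classifier on the
### deliberately overfit 150-sample slice, check test accuracy, and look for
### features with suspiciously high importance, reusing the variables already
### defined in this snippet.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

clf = DecisionTreeClassifier()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
print("test accuracy:", accuracy_score(labels_test, pred))

# features with very high importance often point at leaky, author-specific words
for idx, imp in enumerate(clf.feature_importances_):
    if imp > 0.2:
        print(idx, imp, vectorizer.get_feature_names()[idx])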
df_y # In[14]: df_x = np.array(df_x) df_y = np.array(df_y) # In[15]: df_x.shape # In[16]: # test train split# test t x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=4) # done with preprocessing # In[17]: #CNN model#CNN mod model = Sequential() # 32 filter 3*3 size model.add( Convolution2D(32, 3, data_format='channels_last', activation='relu', input_shape=(28, 28, 1))) # reduce number of parameters by getting imporatant params
from utils.dataloaders import load_wassa, sentence_dataset
from utils.early_stopping import Early_stopping
from utils.load_embeddings import load_word_vectors
from utils.nlp import twitter_preprocessor
from utils.training import class_weigths, epoch_summary, save_checkpoint

# load dataset
config = ConfLangModel
dataset = 'emotion2M'
name = 'emotion_with_2M'
data = sentence_dataset(os.path.join(DATA_DIR, dataset, "emotion_final.txt"))
y = np.zeros(len(data))
train_data, val_data, _, _ = train_test_split(data, y, test_size=0.2, random_state=13)
# train_data = train_data[:1000]
# val_data = val_data[:100]

#####################################################################
# Define Dataloaders
#####################################################################
# Attention! The emotion dataset is already PREPROCESSED with ekphrasis!
# preprocessor = twitter_preprocessor()
preprocessor = None

if preprocessor is None:
    train_name = "train_simple_split_{}".format(dataset)
    val_name = "valid_simple_split_{}".format(dataset)
else:
### load up some practice data with outliers in it
ages = pickle.load( open("practice_outliers_ages.pkl", "r") )
net_worths = pickle.load( open("practice_outliers_net_worths.pkl", "r") )


### ages and net_worths need to be reshaped into 2D numpy arrays
### second argument of reshape command is a tuple of integers: (n_rows, n_columns)
### by convention, n_rows is the number of data points
### and n_columns is the number of features
ages = numpy.reshape( numpy.array(ages), (len(ages), 1))
net_worths = numpy.reshape( numpy.array(net_worths), (len(net_worths), 1))
from sklearn.cross_validation import train_test_split
ages_train, ages_test, net_worths_train, net_worths_test = train_test_split(
    ages, net_worths, test_size=0.1, random_state=42)

### fill in a regression here!  Name the regression object reg so that
### the plotting code below works, and you can see what your regression looks like
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(ages_train, net_worths_train)
print 'The Slope Of The Regression Line Is: ', reg.coef_
print 'The Regression Score On Test Data: ', reg.score(ages_test, net_worths_test)

try:
    plt.plot(ages, reg.predict(ages), color="blue")
except NameError:
# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=300, n_jobs=-1, class_weight='balanced_subsample')

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

from sklearn.metrics import (recall_score, precision_score)
print precision_score(labels_test, pred, average='binary')
print recall_score(labels_test, pred, average='binary')

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf, my_dataset, features_list)
        X0[s2, :] = X[i, :]
        s2 = s2 + 1

Y0 = Y0[0:s2]
X0 = X0[0:s2]
Y1 = Y1[0:s1]
X1 = X1[0:s1]

# Ensure the same proportion of positive and negative examples
total = len(Y0)
partial = len(Y1)
prop = partial / total

## Shuffle the data, keeping only a fraction of the majority class
X_use, X_discard, Y_use, Y_discard = train_test_split(
    X0, Y0,
    test_size=0.3,
    train_size=prop,
    random_state=90)

## Reconstruct the data set with the same class proportion
X_data = np.concatenate([X1, X_use])
Y_data = np.concatenate([Y1, Y_use])

## Normalization (min-max scaling of the first 7 features)
for i in range(7):
    x_max = max(X_data[:, i])
    x_min = min(X_data[:, i])
    k = 1.0 / (x_max - x_min)
    X_data[:, i] = (X_data[:, i] - x_min) * k

# Data preparation for the training and test sets
prop = 12000 / len(Y_data)
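# A minimal equivalent sketch (assumption: the first 7 columns of X_data are the numeric
# features being rescaled) showing how scikit-learn's MinMaxScaler performs the same
# min-max normalization as the manual loop above.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))
X_data[:, :7] = scaler.fit_transform(X_data[:, :7])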
def PCAPlot():
    import io, sys
    sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')

    # Collect every tag that appears in either corpus
    tagset = set([])
    tags = ["i2vtags", "mstags", "gotags"]
    # tags = ["gotags"]
    for tag in tags:
        for item in anime:
            for st in item[tag]:
                t = tag + "_" + st[0].replace(" ", "_")
                tagset.add(t)
        for item in jpop:
            for st in item[tag]:
                t = tag + "_" + st[0].replace(" ", "_")
                tagset.add(t)

    idtag = list(tagset)
    idtag.sort()
    idtag = ["anime/jpop"] + idtag
    tagid = {}
    for id, tag in enumerate(idtag):
        tagid[tag] = id

    # Build a dense feature matrix: one row per track, one column per tag
    feature = np.zeros((len(jpop) * 2, len(idtag) - 1))
    cnt = 0
    for item in anime[:len(jpop)]:
        for tag in tags:
            for st in item[tag]:
                t = tag + "_" + st[0].replace(" ", "_")
                feature[cnt][tagid[t] - 1] = st[1]
        cnt += 1
    for item in jpop:
        for tag in tags:
            for st in item[tag]:
                t = tag + "_" + st[0].replace(" ", "_")
                feature[cnt][tagid[t] - 1] = st[1]
        cnt += 1

    from sklearn.decomposition import PCA
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

    # 2D PCA projection, coloured by class
    pca = PCA(n_components=2)
    xtr = pca.fit_transform(feature)
    plt.scatter(xtr[:len(jpop), 0], xtr[:len(jpop), 1], color="red", label="anime")
    plt.scatter(xtr[len(jpop):, 0], xtr[len(jpop):, 1], color="blue", label="jpop")
    plt.legend()
    plt.savefig("pca.png")
    plt.show()

    # LDA on a per-class train/test split so both sets stay balanced
    target = [0] * len(jpop) + [1] * len(jpop)
    xtr1, xte1, ytr1, yte1 = train_test_split(feature[:len(jpop)], [0] * len(jpop), test_size=0.2)
    xtr2, xte2, ytr2, yte2 = train_test_split(feature[len(jpop):], [1] * len(jpop), test_size=0.2)
    xtr = list(xtr1) + list(xtr2)
    xte = list(xte1) + list(xte2)
    ytr = list(ytr1) + list(ytr2)
    yte = list(yte1) + list(yte2)
    lda = LinearDiscriminantAnalysis()
    ytrp = lda.fit_transform(xtr, ytr)
    ytep = lda.transform(xte)
    print(lda.score(xtr, ytr), lda.score(xte, yte))

    # Histograms of the LDA projection for train and test sets
    plt.subplot(2, 1, 1)
    plt.hist(ytrp[:len(ytrp) // 2], density=True, bins=50, alpha=0.3, label="anime", color="red")
    plt.hist(ytrp[len(ytrp) // 2:], density=True, bins=50, alpha=0.3, label="jpop", color="blue")
    plt.xlabel("train")
    plt.legend()
    plt.subplot(2, 1, 2)
    plt.hist(ytep[:len(ytep) // 2], density=True, bins=50, range=(-20, 20), alpha=0.3, label="anime", color="red")
    plt.hist(ytep[len(ytep) // 2:], density=True, bins=50, range=(-20, 20), alpha=0.3, label="jpop", color="blue")
    plt.xlabel("test")
    plt.legend()
    plt.savefig("lda.png")
    plt.show()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 17 20:22:17 2017

@author: rishi
"""
# Just copy and paste this content and change the variable indexes as per your data

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

dataset = pd.read_csv('Data.csv')
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=0)
def makeCorpus():
    import io, sys
    sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')

    # Build one whitespace-joined "document" of tags per track
    corp_anime = []
    corp_jpop = []
    tagset = set([])
    tag = "gotags"
    for item in anime:
        if (len(item[tag]) > 0):
            pack = []
            for st in item[tag]:
                st = st[0].replace(" ", "")
                tagset.add(st)
                pack.append(st)
            corp_anime.append(" ".join(pack))
    for item in jpop:
        if (len(item[tag]) > 0):
            pack = []
            for st in item[tag]:
                st = st[0].replace(" ", "")
                tagset.add(st)
                pack.append(st)
            corp_jpop.append(" ".join(pack))

    # TF-IDF features over the tag corpus
    trf = TfidfVectorizer(max_df=0.9, min_df=5)
    trf.fit(corp_anime + corp_jpop)
    xa = trf.transform(corp_anime).toarray()
    xj = trf.transform(corp_jpop).toarray()

    # Split each class separately so train and test sets stay balanced
    xjtr, xjte = train_test_split(xj, test_size=0.2)
    xatr, xate = train_test_split(xa, train_size=xjtr.shape[0], test_size=xjte.shape[0])
    voc = trf.vocabulary_
    # print(len(voc))
    # print(voc)
    xtr = np.vstack((xatr, xjtr))
    # print(xtr.shape, xatr.shape, xjtr.shape)
    xte = np.vstack((xate, xjte))
    # print(xte.shape, xate.shape, xjte.shape)
    ytr = [0] * xatr.shape[0] + [1] * xjtr.shape[0]
    yte = [0] * xate.shape[0] + [1] * xjte.shape[0]

    from sklearn.ensemble import RandomForestClassifier
    from sklearn import svm
    import xgboost as xgb
    from sklearn.model_selection import GridSearchCV

    param = {"n_estimators": list(range(25, 35, 1)), "max_depth": [2]}
    rf = RandomForestClassifier()
    '''
    param = {
        "learning_rate": np.linspace(0.1, 0.2, 1),
        "n_estimators": np.arange(500, 600, 300),
        "min_child_weight": np.arange(1, 2, 2),
        "max_depth": np.arange(3, 4, 12),
        "gamma": np.linspace(0.1, 0.2, 1),
        "subsample": np.linspace(0.8, 0.9, 1),
        "colsample_bytree": np.linspace(0.8, 0.9, 1)
    }
    print(param)
    rf = xgb.XGBClassifier()
    '''
    grd = GridSearchCV(rf, param)
    grd.fit(xtr, ytr)
    clf = grd.best_estimator_

    # Rank features by importance and write them to disk with their vocabulary terms
    imp = clf.feature_importances_
    imps = []
    for ind, inv in enumerate(imp):
        imps.append([inv, ind])
    imps.sort()
    imps.reverse()
    fout = open("importance.txt", "w")
    for item in imps:
        key = [key for key, value in voc.items() if value == item[1]][0]
        fout.write("{0} {1}\n".format(key, item[0]))
    fout.close()

    train_sc = clf.score(xtr, ytr)
    test_sc = clf.score(xte, yte)
    print(train_sc, test_sc)
    print(grd.best_params_)
@author: Hardikk Madaan
"""
# LEARNING LOGISTIC REGRESSION

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

dataset = pd.read_csv("Social_Network_Ads.csv")
features = dataset.iloc[:, [2, 3]].values
labels = dataset.iloc[:, 4].values

# SPLITTING
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.25, random_state=0)

# FEATURE SCALING
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
features_train = sc.fit_transform(features_train)
features_test = sc.transform(features_test)

# FITTING LOGISTIC REGRESSION TO THE TRAINING SET
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(features_train, labels_train)

# PREDICTING THE RESULTS
labels_pred = classifier.predict(features_test)
def form_post():
    global team1
    global team2
    team1 = request.form['sel1']
    team2 = request.form['sel2']
    if team1 != '':
        data = pd.read_csv('final_final_dataset.csv')
        data = data[data.MW > 3]
        teamname = team1
        data.drop(['Unnamed: 0', 'HomeTeam', 'AwayTeam', 'Date', 'MW', 'HTFormPtsStr', 'ATFormPtsStr',
                   'FTHG', 'FTAG', 'HTGS', 'ATGS', 'HTGC', 'ATGC', 'HomeTeamLP', 'AwayTeamLP',
                   'DiffPts', 'HTFormPts', 'ATFormPts', 'HM4', 'HM5', 'AM4', 'AM5',
                   'HTLossStreak5', 'ATLossStreak5', 'HTWinStreak5', 'ATWinStreak5',
                   'HTWinStreak3', 'HTLossStreak3', 'ATWinStreak3', 'ATLossStreak3'],
                  axis=1, inplace=True)

        # Separate into feature set and target variable
        X_all = data.drop(['FTR'], axis=1)
        y_all = data['FTR']

        # Standardise the continuous features
        cols = [['HTGD', 'ATGD', 'HTP', 'ATP', 'DiffLP']]
        for col in cols:
            X_all[col] = scale(X_all[col])

        # Recent-form indicators are treated as categorical
        X_all.HM1 = X_all.HM1.astype('str')
        X_all.HM2 = X_all.HM2.astype('str')
        X_all.HM3 = X_all.HM3.astype('str')
        X_all.AM1 = X_all.AM1.astype('str')
        X_all.AM2 = X_all.AM2.astype('str')
        X_all.AM3 = X_all.AM3.astype('str')

        def preprocess_features(X):
            '''Preprocesses the football data and converts categorical variables into dummy variables.'''
            # Initialize new output DataFrame
            output = pd.DataFrame(index=X.index)
            # Investigate each feature column of the data
            for col, col_data in X.items():
                # If data type is categorical, convert to dummy variables
                if col_data.dtype == object:
                    col_data = pd.get_dummies(col_data, prefix=col)
                # Collect the revised columns
                output = output.join(col_data)
            return output

        X_all = preprocess_features(X_all)
        print("Processed feature columns ({} total features):\n{}".format(len(X_all.columns), list(X_all.columns)))

        X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=50,
                                                            random_state=2, stratify=y_all)

        def predict_labels(clf, features, target):
            ''' Makes predictions using a fit classifier based on F1 score.
            '''
            # Start the clock, make predictions, then stop the clock
            start = time()
            print("-------------")
            print(type(features))
            print("---------------")
            y_pred = clf.predict(features)
            end = time()
            # Print and return results
            print("Made predictions in {:.4f} seconds.".format(end - start))
            return f1_score(target, y_pred, labels=['A', 'D', 'H'], average='micro'), \
                sum(target == y_pred) / float(len(y_pred))

        # TODO: Initialize the classifier
        f1_scorer = make_scorer(f1_score, labels=['A', 'D', 'H'], average='micro')
        parameters = {
            'learning_rate': [0.1],
            'n_estimators': [40],
            'max_depth': [3],
            'min_child_weight': [3],
            'gamma': [0.4],
            'subsample': [0.8],
            'colsample_bytree': [0.8],
            'scale_pos_weight': [1],
            'reg_alpha': [1e-5]
        }
        # clf.fit(X_train, y_train)

        # Baselines: logistic regression and an RBF SVM
        logistic = LogisticRegression(random_state=42)
        svm = SVC(random_state=912, kernel='rbf')

        logistic.fit(X_train, y_train)
        f1, acc = predict_labels(logistic, X_test, y_test)
        print("Logistic Regression --> F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1, acc))

        svm.fit(X_train, y_train)
        f1, acc = predict_labels(svm, X_test, y_test)
        print("SVM --> F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1, acc))

        clf = xgb.XGBClassifier(seed=2)

        # TODO: Perform grid search on the classifier using the f1_scorer as the scoring method
        grid_obj = GridSearchCV(clf, scoring=f1_scorer, param_grid=parameters, cv=5)

        # TODO: Fit the grid search object to the data and find the optimal parameters
        grid_obj = grid_obj.fit(X_all, y_all)

        # Get the estimator
        clf = grid_obj.best_estimator_
        # print(clf)

        # Report the final F1 score for training and testing after parameter tuning
        f1, acc = predict_labels(clf, X_train, y_train)
        print("final F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1, acc))
        f1, acc = predict_labels(clf, X_test, y_test)
        print("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1, acc))

        # Look up the row for the selected home team and predict the outcome
        data2 = pd.read_csv('team_dataframe.csv')
        data2 = data2.iloc[30:]
        global teamindex
        teamindex = 122
        for index, row in data2.iterrows():
            if teamname == row['HomeTeam']:
                teamindex = index
        # print(type(X_all.loc[x].to_frame().T))
        # print(X_all.loc[x].to_frame().T)
        winnerlist = clf.predict(X_all.loc[teamindex].to_frame().T)
        print(winnerlist)
        global teamwin
        global hnh
        teamwin = winnerlist[0]
        if teamwin == 'A':
            teamwin = team2
            hnh = "AwayTeam"
        elif teamwin == 'H':
            teamwin = team1
            hnh = "HomeTeam"
        else:
            teamwin = "DRAW!"
            hnh = "The game will be a DRAW"
        print(teamwin)
    else:
        print(team1 + " " + team2)
    return render_template('index.html', text=teamwin, bleh=team2, blehh=hnh)
def linearSep():
    import io, sys
    sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')

    # Build one whitespace-joined "document" of i2v tags per track
    corp_anime = []
    corp_jpop = []
    print(anime[0])
    tagset = set([])
    tag = "i2vtags"
    for item in anime:
        if (len(item[tag]) > 0):
            pack = []
            for st in item[tag]:
                st = st[0].replace(" ", "")
                tagset.add(st)
                pack.append(st)
            corp_anime.append(" ".join(pack))
    for item in jpop:
        if (len(item[tag]) > 0):
            pack = []
            for st in item[tag]:
                st = st[0].replace(" ", "")
                tagset.add(st)
                pack.append(st)
            corp_jpop.append(" ".join(pack))

    # TF-IDF features, split per class so train and test sets stay balanced
    trf = TfidfVectorizer(max_df=1.0, min_df=10)
    trf.fit(corp_anime + corp_jpop)
    xa = trf.transform(corp_anime).toarray()
    xj = trf.transform(corp_jpop).toarray()
    xjtr, xjte = train_test_split(xj, test_size=0.2)
    xatr, xate = train_test_split(xa, train_size=xjtr.shape[0], test_size=xjte.shape[0])
    voc = trf.vocabulary_
    xtr = np.vstack((xatr, xjtr))
    xte = np.vstack((xate, xjte))
    ytr = [0] * xatr.shape[0] + [1] * xjtr.shape[0]
    yte = [0] * xate.shape[0] + [1] * xjte.shape[0]

    from sklearn import svm
    from sklearn.feature_selection import RFE, RFECV

    # Linear SVM with recursive feature elimination
    param = {"C": [0.01, 0.1, 1, 10, 100, 1000]}
    rf = svm.LinearSVC()
    rfe = RFE(estimator=rf, n_features_to_select=10000, step=10)
    rfe.fit(xtr, ytr)
    xtrt = rfe.transform(xtr)
    xtet = rfe.transform(xte)
    rf.fit(xtrt, ytr)
    print(rf.score(xtrt, ytr))
    print(rf.score(xtet, yte))

    # Indices of the features kept by RFE
    supIndex = rfe.get_support(indices=True)

    def getIdKey(id):
        return [key for key, value in voc.items() if value == id][0]

    # Most negative and most positive SVM coefficients with their vocabulary terms
    feats = [[rf.coef_[0][i], getIdKey(v)] for i, v in enumerate(supIndex)]
    feats.sort()
    print("\n".join(list(map(str, feats[0:5]))))
    feats.reverse()
    print("\n".join(list(map(str, feats[0:5]))))
# Dataset provided by SuperDataScience.com

# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# import dataset and declare x & y variables
dataset = pd.read_csv('Social_Network_Ads.csv')
x = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

# split the dataset into the training and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=0)

# feature scaling (not necessary for decision trees, but helps when visualizing the data)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# fit the classifier to the training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
classifier.fit(x_train, y_train)
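# A minimal follow-up sketch (not in the original snippet) showing how the fitted random
# forest above could be evaluated on the held-out test set; it assumes the x_test, y_test,
# and classifier objects defined above.
from sklearn.metrics import confusion_matrix, accuracy_score

y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print("test accuracy:", accuracy_score(y_test, y_pred))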
train_column = [
    'Feature 1', 'Feature 2', 'Feature 3', 'Feature 4', 'Feature 6',
    'Feature 7', 'Feature 8', 'Feature 9', 'Feature 10'
]
X = d[train_column]
Y = d[target_column]
y_true = d[target_column]

# Train and score on progressively larger slices of the data
a = np.array([100, 500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000])
for i in a:
    # print(i)
    XX = X.iloc[0:i]  # first i rows of the features
    YY = Y.iloc[0:i]
    X_train, X_test, Y_train, Y_test = train_test_split(XX, YY, test_size=0.30, random_state=20)
    # print(XX)
    # print(YY)
    clf = linear_model.LogisticRegression()
    clf.fit(X_train, Y_train)
    # print(clf.predict(XX))
    y_pred = clf.predict(X_test)
    # print(clf.score(XX, YY))
    AA = accuracy_score(Y_test, y_pred)
    print('Accuracy Score:')
    print(AA)
    BB = f1_score(Y_test, y_pred, average='micro')
    print('F1 Score:')
    print(BB)
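# The loop above is effectively computing a learning curve by hand. As an alternative
# sketch (assuming the same X and Y defined above, and accuracy as the metric),
# scikit-learn's learning_curve utility performs the size sweep and cross-validation
# in one call.
from sklearn.model_selection import learning_curve
from sklearn.linear_model import LogisticRegression
import numpy as np

sizes, train_scores, test_scores = learning_curve(
    LogisticRegression(), X, Y,
    train_sizes=np.linspace(0.1, 1.0, 5),
    cv=3, scoring='accuracy')
print(sizes)
print(test_scores.mean(axis=1))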
one_hot_encoder = enc.fit(integer_classes)
# First, convert classes to 0-(N-1) integers using label_encoder
num_of_rows = titanic_X.shape[0]
t = label_encoder.transform(titanic_X[:, 0]).reshape(num_of_rows, 1)
# Second, create a sparse matrix with three columns, each one indicating whether the instance belongs to that class
new_features = one_hot_encoder.transform(t)
# Add the new features to titanic_X
titanic_X = np.concatenate([titanic_X, new_features.toarray()], axis=1)
# Eliminate the converted column
titanic_X = np.delete(titanic_X, [0], 1)
# Update feature names
feature_names = ['age', 'sex', 'first_class', 'second_class', 'third_class']
# Convert to numerical values
titanic_X = titanic_X.astype(float)
titanic_y = titanic_y.astype(float)

## Check
print(feature_names)
print(titanic_X[0], titanic_y[0])

## Holdout split
X_train, X_test, y_train, y_test = train_test_split(titanic_X, titanic_y, test_size=0.25, random_state=33)
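# An alternative sketch (illustrative names, not from the original code): in current
# scikit-learn, OneHotEncoder can encode the string class column directly, so the
# intermediate LabelEncoder step used above is not needed. This starts again from the
# raw titanic_X array, before the block above appends its dummy columns.
import numpy as np
from sklearn.preprocessing import OneHotEncoder

pclass_col = titanic_X[:, 0].reshape(-1, 1)            # hypothetical: the raw class column
ohe = OneHotEncoder(handle_unknown='ignore')
class_dummies = ohe.fit_transform(pclass_col).toarray()  # one column per class value
titanic_X = np.concatenate([np.delete(titanic_X, [0], 1), class_dummies], axis=1)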
test[features] = scl.transform(test[features])

params = {"objective": "reg:linear",
          "eta": 0.3,
          "max_depth": 8,
          "subsample": 0.7,
          "colsample_bytree": 0.7,
          "silent": 1
          }
num_trees = 300

print("Train a XGBoost model")
val_size = 100000
# train = train.sort(['Date'])
print(train.tail(1)['Date'])

from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(train, test_size=0.01)
# X_train, X_test = train.head(len(train) - val_size), train.tail(val_size)

# Train on log(Sales + 1), validating with the custom RMSPE metric
dtrain = xgb.DMatrix(X_train[features], np.log(X_train["Sales"] + 1))
dvalid = xgb.DMatrix(X_test[features], np.log(X_test["Sales"] + 1))
dtest = xgb.DMatrix(test[features])
watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
gbm = xgb.train(params, dtrain, num_trees, evals=watchlist,
                early_stopping_rounds=50, feval=rmspe_xg, verbose_eval=True)

print("Validating")
train_probs = gbm.predict(xgb.DMatrix(X_test[features]))
indices = train_probs < 0
train_probs[indices] = 0
error = rmspe(np.exp(train_probs) - 1, X_test['Sales'].values)
print('error', error)

print("Make predictions on the test set")
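# The snippet above relies on rmspe / rmspe_xg helpers that are defined elsewhere in the
# original script. A minimal sketch of what such helpers typically look like for this
# log(Sales + 1) setup (an assumption, not the original definitions):
import numpy as np

def rmspe(yhat, y):
    # Root mean squared percentage error, ignoring zero-sales rows
    mask = y != 0
    return np.sqrt(np.mean(((y[mask] - yhat[mask]) / y[mask]) ** 2))

def rmspe_xg(yhat, dtrain):
    # Custom eval metric for xgb.train: labels and predictions are log(Sales + 1)
    y = np.exp(dtrain.get_label()) - 1
    yhat = np.exp(yhat) - 1
    return "rmspe", rmspe(yhat, y)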