Exemplo n.º 1
0
def train_test_split(model_df, percent, num_bootstraps):
    '''
    Build a hold-out test set and several down-sampled training sets.

    Rows whose 'customer_life' is below 60 are discarded first. The
    remaining rows are split with ``tts`` using ``percent`` as the test
    fraction. The training part is then separated by 'churn_flag' and, for
    every bootstrap, a random subsample of the unchurned rows is stacked
    on top of a random 80% subsample of the churned rows.

    Parameters:
        model_df: DataFrame holding at least the 'customer_life' and
            'churn_flag' columns. It is not modified.
        percent: test fraction passed to ``tts`` for the master split.
        num_bootstraps: number of down-sampled training sets to draw.

    Returns:
        dict with the keys 'test_set', 'train_indep', 'train_dep' and
        'master_train'. 'train_indep' and 'train_dep' are dicts keyed by
        bootstrap index holding the feature and target frames.
    '''
    print('Entered train test split')
    print(model_df.head())
    print(model_df['customer_life'])

    # Filter with a boolean mask instead of writing a temporary 'flag'
    # column through chained indexing: that form mutated the caller's
    # frame and is not guaranteed by pandas to write through at all.
    short_lived = model_df['customer_life'] < 60
    kept_columns = [c for c in model_df.columns.tolist() if c != 'flag']
    model_df = model_df.loc[~short_lived, kept_columns].copy()

    # Master train = 80% of data when percent = 0.2.
    master_train, test_data = tts(model_df, test_size=percent)
    train_churn = master_train[master_train['churn_flag'] == 1]
    train_uchurn = master_train[master_train['churn_flag'] == 0]
    print(len(train_churn))
    print(len(train_uchurn))

    # Fraction of unchurned rows drawn per bootstrap: nine times the size
    # of an 80% churned subsample, relative to all unchurned rows.
    train_subsample_size = int(len(train_churn) * 0.8)
    test_size = float(train_subsample_size * 9) / float(len(train_uchurn))

    train_indep_dsamples = {}
    train_dep_dsamples = {}

    # The column layout is the same for every bootstrap, so derive it once.
    indep_columns = master_train.columns.tolist()
    indep_columns.remove('churn_flag')
    dep_columns = ['churn_flag']

    print(test_size)
    for i in range(num_bootstraps):
        print(str(i))
        _, down_train_uchurn = tts(train_uchurn, test_size=test_size)
        _, down_train_churn = tts(train_churn, test_size=0.8)
        indep_set = pd.concat([
            down_train_uchurn[indep_columns], down_train_churn[indep_columns]
        ])
        dep_set = pd.concat(
            [down_train_uchurn[dep_columns], down_train_churn[dep_columns]])
        print(len(indep_set))
        print(len(dep_set))
        train_indep_dsamples[i] = indep_set
        train_dep_dsamples[i] = dep_set

    return {
        'test_set': test_data,
        'train_indep': train_indep_dsamples,
        'train_dep': train_dep_dsamples,
        'master_train': master_train
    }
Exemplo n.º 2
0
def build_and_evaluate(X, y, classifier=svm.SVC, verbose=True):
    """Fit a text-feature pipeline on 80% of the data and print a report.

    Parameters:
        X: input documents handed to the pipeline.
        y: target labels; they are label-encoded before splitting.
        classifier: estimator class or instance used as the final pipeline
            step. A class is instantiated with its default arguments.
        verbose: when true, progress messages are printed. The
            classification report itself is always printed.

    Returns:
        None. The fitted model is not returned.
    """
    def build(classifier, X, y=None):
        """Assemble the feature union plus classifier and fit it."""
        if isinstance(classifier, type):
            classifier = classifier()

        # TF-IDF over uni- and bigrams, reduced to 50 components.
        ngram_features = Pipeline([
            ('ngram',
             TfidfVectorizer(ngram_range=(1, 2),
                             tokenizer=identity,
                             preprocessor=None,
                             lowercase=False)),
            ('best', TruncatedSVD(n_components=50)),
        ])
        bag_of_words = Pipeline([
            ('preprocessor', NLTKPreprocessor()),
            ('topics_and_ngrams',
             FeatureUnion(transformer_list=[('grams', ngram_features)])),
        ])
        features = FeatureUnion(transformer_list=[
            ('bag_words', bag_of_words),
            # add other features here as an element in transformer list
            ('capitalize',
             Pipeline([('cap_words', CaptilizationExtractor())])),
            ('punctuation', PuncuationExtractor()),
        ])

        model = Pipeline([
            ('union', features),
            # Use the requested classifier; previously a fresh svm.SVC()
            # was hard-coded here and the argument was silently ignored.
            ('svc', classifier),
        ])
        model.fit(X, y)
        return model

    labels = LabelEncoder()
    y = labels.fit_transform(y)

    if verbose:
        print("Building for evaluation")
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
    model = build(classifier, X_train, y_train)

    if verbose:
        print("classification Report: \n")

    y_pred = model.predict(X_test)
    print(clsr(y_test, y_pred))
    def Run(self, csv):
        """Fit a linear regression on semicolon-separated CSV text.

        The first column is used as the single feature and the second as
        the target. 20% of the rows are held out (random_state=0) and
        predicted.

        Parameters:
            csv: CSV content as a string, using ';' as delimiter.

        Returns:
            list of dicts with the keys 'Test' (held-out feature value),
            'Expected' (true target) and 'Predicted'.
        """
        # sklearn.cross_validation was removed from scikit-learn; prefer
        # model_selection and fall back only on old installations.
        try:
            from sklearn.model_selection import train_test_split as tts
        except ImportError:
            from sklearn.cross_validation import train_test_split as tts
        from sklearn.linear_model import LinearRegression

        dataset = pd.read_csv(StringIO(csv), delimiter=';')
        x = dataset.iloc[:, 0:1].values
        y = dataset.iloc[:, 1].values

        # split base into train and test
        x_train, x_test, y_train, y_test = tts(x,
                                               y,
                                               test_size=0.2,
                                               random_state=0)

        # fit the regression and predict the held-out rows
        regressor = LinearRegression()
        regressor.fit(x_train, y_train)
        y_pred = regressor.predict(x_test)

        result = [
            {
                'Test': features[0],
                'Expected': expected,
                'Predicted': predicted,
            }
            for features, expected, predicted in zip(x_test, y_test, y_pred)
        ]

        print(result)

        return result
    def Run(self, csv):
        """Fit a degree-4 polynomial regression on CSV text.

        The first column is used as the single feature and the second as
        the target. 20% of the rows are held out (random_state=0) and
        predicted.

        Parameters:
            csv: CSV content as a string (comma separated).

        Returns:
            list of dicts with the keys 'Expected' and 'Preditect'.
            NOTE(review): 'Expected' carries the held-out feature value,
            not the true target; the keys are kept as-is because callers
            may rely on them.
        """
        # sklearn.cross_validation was removed from scikit-learn; prefer
        # model_selection and fall back only on old installations.
        try:
            from sklearn.model_selection import train_test_split as tts
        except ImportError:
            from sklearn.cross_validation import train_test_split as tts
        from sklearn.linear_model import LinearRegression
        from sklearn.preprocessing import PolynomialFeatures

        dataset = pd.read_csv(StringIO(csv))
        x = dataset.iloc[:, 0:1].values
        y = dataset.iloc[:, 1].values

        x_train, x_test, y_train, y_test = tts(x,
                                               y,
                                               test_size=0.2,
                                               random_state=0)

        feature_poly = PolynomialFeatures(degree=4)
        x_poly = feature_poly.fit_transform(x_train)

        pr = LinearRegression()
        pr.fit(x_poly, y_train)

        # The expansion was fitted on the training data; only apply it here.
        y_pred = pr.predict(feature_poly.transform(x_test))

        result = [
            {
                'Expected': features[0],
                'Preditect': predicted,
            }
            for features, predicted in zip(x_test.tolist(), y_pred)
        ]

        print(result)

        return result
def fit_model(X, y):
    """Fit the module-level ``svc`` on a train split and report accuracy.

    One sixth of the samples is held out (random_state=0), ``svc`` is
    fitted on the rest and the fraction of correct held-out predictions
    is printed and returned.

    Parameters:
        X: feature matrix.
        y: target vector.

    Returns:
        Accuracy on the held-out split.
    """
    # 1.0 / 6 keeps the fraction a float even under Python 2, where the
    # integer division 1 / 6 evaluates to 0.
    Xtr, Xts, ytr, yts = tts(X, y, test_size=1.0 / 6, random_state=0)
    svc.fit(Xtr, ytr)
    yhat_ts = svc.predict(Xts)
    acc = np.mean(yhat_ts == yts)
    print('Accuaracy = {0:f}'.format(acc))
    return acc
Exemplo n.º 6
0
def plot_roc_curve(estimators, X, y):
    """Fit each estimator and draw their ROC curves into one figure.

    The data is split 80/20 (random_state=5557). Every estimator is fitted
    on the training part and scored on the test part, using
    ``predict_proba`` when available and ``decision_function`` otherwise.
    Estimators offering neither are skipped with a message. The figure is
    written to 'aggregate_roccurve.png' and then cleared.

    Parameters:
        estimators: a single estimator or a list/tuple of estimators.
        X: feature matrix.
        y: target vector.

    Returns:
        None. Any exception is caught and printed (best effort).
    """
    try:
        if not isinstance(estimators, (list, tuple)):
            estimators = [estimators]

        X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=5557)
        for clf in estimators:
            name = clf.__class__.__name__
            clf.fit(X_train, y_train)
            # hasattr (rather than a dir() lookup) also skips a scoring
            # method whose attribute access raises AttributeError.
            if hasattr(clf, 'predict_proba'):
                y_probas = clf.predict_proba(X_test)[:, 1]
            elif hasattr(clf, 'decision_function'):
                y_probas = clf.decision_function(X_test)
            else:
                print('Probability score not available in {}, skipping.'.format(name))
                continue
            fpr, tpr, thresholds = metrics.roc_curve(y_test, y_probas, drop_intermediate=True)
            plt.plot(fpr, tpr, label=name)

        # The title has no placeholder; formatting it with ``name`` only
        # made an empty estimator list fail on the unbound variable.
        plt.title('ROC Comparison')
        plt.xlim(-0.05, 1.05)
        plt.ylim(-0.05, 1.05)
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend(loc='lower right')
        plt.savefig('aggregate_roccurve.png')
        plt.clf()
    except Exception as e:
        print(e)
Exemplo n.º 7
0
 def train(self):
     """Train the selected classifier and show a majority-share score.

     Clears the 'text1' widget, splits ``self.LIST`` 80/20, looks every
     entity up in the module-level ``model`` mapping, fits a model via
     ``get_model`` and predicts the test part. The share of the most
     frequent predicted label is stored in ``self.ACC[self.ini]`` and
     written to the widget. When more than one label is predicted, the
     test items of the least frequent label are stored in
     ``self.outliers[self.ini]``.

     Returns:
         None. Returns early with a message when ``self.LIST`` is unset.
     """
     self.configs['text1'].delete('1.0', END)
     try:
         self.LIST
     except AttributeError:
         # Only a missing attribute means "not initialized"; a bare
         # except would also swallow unrelated errors and interrupts.
         self.configs['text1'].insert(
             "1.0", 'The entities have not been initialized!')
         return
     x_train_o, x_test_o = tts(self.LIST, test_size=0.2)
     x_train = np.array([model[i] for i in x_train_o])
     x_test = np.array([model[i] for i in x_test_o])
     train_model = get_model(x_train, self.ini)
     y_pred = train_model.predict(x_test)
     labels = sorted(set(y_pred))
     counts = [sum(y_pred == label) for label in labels]
     if len(counts) > 1:
         # The least frequent predicted label is treated as the outliers.
         rarest = labels[np.argmin(counts)]
         self.outliers[self.ini] = x_test_o[y_pred == rarest]
     ACC = max(counts) * 1.0 / len(y_pred)
     self.ACC[self.ini] = ACC
     self.trained[self.ini] = True
     self.configs['text1'].insert(
         "1.0", 'Type of classifier: ' + names[self.ini] +
         '\n The ACC is:\n' + str(ACC))
Exemplo n.º 8
0
def train_split(data, outcome, predictors, ratio=0.3):
    """Split the predictor and outcome columns of ``data`` into train/test.

    Parameters:
        data: indexable container (e.g. a DataFrame) holding all columns.
        outcome: key selecting the target column(s).
        predictors: key(s) selecting the feature columns.
        ratio: test fraction passed to ``tts``.

    Returns:
        tuple (x_train, x_test, l_train, l_test); the split uses the
        fixed seed 123.
    """
    features = data[predictors]
    target = data[outcome]
    parts = tts(features, target, test_size=ratio, random_state=123)
    train_x, test_x, train_labels, test_labels = parts
    return train_x, test_x, train_labels, test_labels
def buildnEvaluateModel(X, y):
    '''
    Train a linear-SVC text classifier and report its hold-out quality.

    20% of the input is set aside for validation. The model is fitted on
    the remaining 80%; accuracy and a classification report for the
    validation part are printed.

    Parameters:
        X: input documents.
        y: targets convertible through int() to 0/1 style values.

    Returns:
        The fitted pipeline.
    '''
    X_train, X_cv, y_train, y_cv = tts(X, y, test_size=0.2)

    # Coerce the numeric targets to booleans.
    y_train = [bool(int(value)) for value in y_train]
    y_cv = [bool(int(value)) for value in y_cv]

    # The encoder is fitted only to obtain class names for the report.
    labels = LabelEncoder()
    labels.fit_transform(y_train)

    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SVC(kernel='linear', probability=True)),
    ]
    text_clf = Pipeline(steps)
    text_clf = text_clf.fit(X_train, y_train)

    # Evaluate on the validation part.
    predicted = text_clf.predict(X_cv)
    accuracy = np.mean(predicted == y_cv)
    print("Model Accuracy = " + str(accuracy))
    class_names = [str(name) for name in labels.classes_]
    print(clsr(y_cv, predicted, target_names=class_names))

    return text_clf
Exemplo n.º 10
0
def build_and_evaluate(X,
                       y,
                       classifier=SGDClassifier,
                       outpath=None,
                       verbose=True):
    """Evaluate a text classifier on a hold-out split, then refit on all data.

    Parameters:
        X: input documents.
        y: target labels; label-encoded before use.
        classifier: estimator class or instance for the final step.
        outpath: when truthy, the final model is pickled to this path.
        verbose: when true, progress and timing messages are printed. The
            classification report is always printed.

    Returns:
        The pipeline fitted on the complete data, carrying the fitted
        LabelEncoder as ``labels_``.
    """
    def build(classifier, X, y=None):
        """Create the preprocessing/vectorizing/classifying pipeline and fit it."""
        estimator = classifier() if isinstance(classifier, type) else classifier
        steps = [
            ('preprocessor', NLTKPreprocessor()),
            ('vectorizer',
             TfidfVectorizer(tokenizer=identity,
                             preprocessor=None,
                             lowercase=False)),
            ('classifier', estimator),
        ]
        pipeline = Pipeline(steps)
        pipeline.fit(X, y)
        return pipeline

    # Encode the targets as integers.
    labels = LabelEncoder()
    y = labels.fit_transform(y)

    started = time()

    # Hold-out evaluation on 20% of the data.
    if verbose:
        print("Building for evaluation")
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
    model = build(classifier, X_train, y_train)

    if verbose:
        elapsed = time() - started
        print("Evaluation model fit in {:0.3f} seconds".format(elapsed))
        print("Classification Report:\n")

    y_pred = model.predict(X_test)
    print(clsr(y_test, y_pred, target_names=labels.classes_))

    # Final model on the complete data.
    started = time()
    if verbose:
        print("Building complete model and saving ...")
    model = build(classifier, X, y)
    model.labels_ = labels

    if verbose:
        elapsed = time() - started
        print("Complete model fit in {:0.3f} seconds".format(elapsed))

    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f)

        print("Model written out to {}".format(outpath))

    return model
def prepare_dataset(corpus, labels, test_data_proportion=0.3):
    '''
    Create a train and test split of a classification dataset.

    Parameters:
        corpus: the documents/samples.
        labels: the targets aligned with ``corpus``.
        test_data_proportion: fraction of the data placed in the test set.

    Returns:
        tuple (train_x, test_x, train_y, test_y); the split uses the
        fixed seed 42.
    '''
    # Honour the caller's proportion; it used to be ignored in favour of
    # a hard-coded 0.3 (which is still the default).
    train_x, test_x, train_y, test_y = tts(corpus,
                                           labels,
                                           test_size=test_data_proportion,
                                           random_state=42)
    return train_x, test_x, train_y, test_y
Exemplo n.º 12
0
def train_subset(x_train, y_train, x_test, porc_corte, pipeline):
    """Fit ``pipeline`` on a subset of the training data and predict ``x_test``.

    Parameters:
        x_train, y_train: full training features and targets.
        x_test: samples to predict.
        porc_corte: fraction passed to ``tts`` as ``test_size``; that part
            of the training data is left out of the fit.
        pipeline: estimator exposing ``fit`` and ``predict``.

    Returns:
        The predictions of the fitted pipeline for ``x_test``.
    """
    kept_x, _, kept_y, _ = tts(x_train,
                               y_train,
                               random_state=0,
                               test_size=porc_corte)
    pipeline.fit(kept_x, kept_y)

    print("Predict!!")
    return pipeline.predict(x_test)
Exemplo n.º 13
0
 def test_resid_plots(self):
     """
     Smoke test: fitting an SVR and scoring a ResidualsPlot must not raise.
     """
     regressor = SVR()
     # X and y come from the enclosing module; half the data is held out.
     splits = tts(X, y, test_size=0.5)
     X_train, X_test, y_train, y_test = splits
     regressor.fit(X_train, y_train)
     ResidualsPlot(regressor).score(X_test, y_test)
Exemplo n.º 14
0
 def __init__(self, iris):
     # Keep the dataset and split it into 70% training data/targets and
     # the remaining test data/targets, using a random seed in [400, 600].
     self.iris = iris
     splits = tts(iris.data,
                  iris.target,
                  train_size=.7,
                  random_state=random.randint(400, 600))
     self.d_train, self.d_test, self.t_train, self.t_test = splits
     # Filled in later; start with no predictions and a zero score.
     self.prediction = []
     self.percent = 0
def data_preparation(x):
    """Separate the "Class" column from the features and split 70/30.

    Parameters:
        x: DataFrame containing a "Class" column.

    Returns:
        tuple (features_train, features_test, labels_train, labels_test).
        The sizes of the training and test parts are printed.
    """
    is_label = x.columns == "Class"
    features = x.iloc[:, ~is_label]
    labels = x.iloc[:, is_label]
    parts = tts(features, labels, test_size=0.3)
    features_train, features_test, labels_train, labels_test = parts
    print("length of training data")
    print(len(features_train))
    print("length of test data")
    print(len(features_test))
    return (features_train, features_test, labels_train, labels_test)
Exemplo n.º 16
0
 def __init__(self, location, split=0.2):
     # Load the CSV at `location`: the last column is the target and all
     # preceding columns are features.
     self.location = location
     frame = pd.read_csv(location)
     self.x = frame.iloc[:, :-1].values
     self.y = frame.iloc[:, -1].values
     # Hold out a `split` fraction of the rows as test data.
     parts = tts(self.x, self.y, test_size=split)
     self.xtr, self.xte, self.ytr, self.yte = parts
     self.t = 0
     self.tt = self.t + 1
Exemplo n.º 17
0
def run_svmtest_int(num):
    """Repeat a 90/10 split-and-score experiment with an SVC.

    Each round splits the module-level ``I`` and ``y2``, fits a fresh
    ``svm.SVC`` on the single reshaped feature column and scores the
    held-out part.

    Parameters:
        num: number of rounds.

    Returns:
        list with one accuracy score per round.
    """
    scores = []
    for _ in range(num):
        I_train, I_test, y2_train, y2_test = tts(I, y2, test_size=.1)
        model = svm.SVC()
        model.fit(I_train.values.reshape(-1, 1), y2_train)
        predicted = model.predict(I_test.values.reshape(-1, 1))
        scores.append(accuracy_score(y2_test, predicted))
    return scores
Exemplo n.º 18
0
    def fit(self, X, y):
        """
        Fit every model in ``self.models`` on an 80% training split.

        The four parts of the split are stored on the instance as
        ``X_train``, ``X_test``, ``y_train`` and ``y_test``, and
        ``self.models`` is replaced by the list of ``fit`` return values.

        TODO: move to MultiModelMixin.
        """
        # TODO: make test size a parameter and do better data storage on viz.
        parts = tts(X, y, test_size=0.2)
        self.X_train, self.X_test, self.y_train, self.y_test = parts
        self.models = [
            model.fit(self.X_train, self.y_train) for model in self.models
        ]
Exemplo n.º 19
0
def main():
    """Cross-validate an RBF SVM over the module-level ``trall``/``yall``.

    For every fold from ``KFold(len(yall), 10)`` the frequency table is
    built from the training samples, features are derived for both parts,
    C and gamma are grid-searched on a 20% slice of the training features,
    and a final SVM is fitted on all training features and scored on the
    fold's test part.

    Returns:
        tuple (mcc, acc, aucs, sen, spe, figs) of per-fold lists; ``figs``
        holds [fpr, tpr] pairs.
    """
    folds = KFold(len(yall), 10)
    sen, spe, acc, mcc = [], [], [], []
    figs = []
    # Search grid for the SVM hyper-parameters.
    param = {
        'C': np.linspace(0.6, 0.8, 10),
        'gamma': np.linspace(0.13, 0.22, 10),
    }
    clist, glist = [], []
    aucs = []
    for train_idx, test_idx in folds:
        print('*********')
        x_train = trall[train_idx]
        y_train = yall[train_idx]
        positives = x_train[y_train == 1]
        negatives = x_train[y_train == 0]
        Table = frequences_matrix_mainFunc(positives, negatives)
        x_train, y_train = GetFeatures(x_train, y_train, Table)
        x_test, y_test = GetFeatures(trall[test_idx], yall[test_idx], Table)

        # Tune on the 20% slice returned as the "test" part of the split.
        svm = SVC(kernel='rbf', probability=True)
        x1, x2, y1, y2 = tts(x_train, y_train, test_size=0.2)
        cv = CV(svm, param, n_jobs=2)
        cv.fit(x2, y2)
        best = cv.best_params_
        c, g = best['C'], best['gamma']
        clist.append(c)
        glist.append(g)
        print('c,g:', c, g)

        # Refit with the selected parameters on the full training fold.
        svm = SVC(kernel='rbf', C=c, gamma=g, probability=True)
        svm.fit(x_train, y_train)
        acc_r = svm.score(x_test, y_test)
        mcc_r, sen_r, spe_r = getmcc2(svm, x_test, y_test)
        acc.append(acc_r)
        mcc.append(mcc_r)
        sen.append(sen_r)
        spe.append(spe_r)

        scores = svm.predict_proba(x_test)[:, 1]
        fpr, tpr, thres = roc_curve(y_test, scores)
        figs.append([fpr, tpr])
        auc_r = auc(fpr, tpr)
        aucs.append(auc_r)
        print(auc_r)
        print('acc:', acc_r, '\n', 'mcc:', mcc_r)
        print('*********')
    return mcc, acc, aucs, sen, spe, figs
Exemplo n.º 20
0
def run_treetest_f1(num):
    """Repeat a 90/10 split-and-score experiment with a decision tree.

    Each round splits the module-level ``V`` and ``y``, fits a fresh
    ``tree.DecisionTreeClassifier`` on the single reshaped feature column
    and scores the held-out part with ``accuracy_score``.

    Parameters:
        num: number of rounds.

    Returns:
        list with one accuracy score per round.
    """
    scores = []
    for _ in range(num):
        V_train, V_test, y_train, y_test = tts(V, y, test_size=.1)
        model = tree.DecisionTreeClassifier()
        model.fit(V_train.values.reshape(-1, 1), y_train)
        predicted = model.predict(V_test.values.reshape(-1, 1))
        scores.append(accuracy_score(y_test, predicted))
    return scores
Exemplo n.º 21
0
def classifier():
    """Grid-search the C parameter of a LinearSVC and print accuracies.

    The count matrix from ``jiebaCounter`` is divided by its axis-1 maxima
    (plus a small epsilon), split 75/25, and a LinearSVC is fitted on the
    training part. A grid search over ten log-spaced C values is then
    fitted, and its best parameters and its train/test accuracies are
    printed.

    Returns:
        None.
    """
    vect, voc, txt = jiebaCounter()
    # normalisation
    maxima = np.max(vect, axis=1)
    x = np.array(vect / (maxima + 1e-10))
    parts = tts(x, y, test_size=0.25, train_size=0.75)
    x_train, x_test, y_train, y_test = parts
    clf = svm.LinearSVC()
    clf.fit(x_train, y_train)
    grid = dict(C=np.logspace(-5, 0, 10))
    clf_ = GridSearchCV(estimator=clf, param_grid=grid)
    # NOTE(review): `x_` and `y` are not defined inside this function;
    # presumably module-level names - confirm against the full file.
    clf_.fit(x_, y)
    print(clf_.best_params_)
    print("train accuracy:")
    train_hits = np.sum(clf_.predict(x_train) == y_train)
    print(train_hits / float(len(y_train)))
    print("test accuracy:")
    test_hits = np.sum(clf_.predict(x_test) == y_test)
    print(test_hits / float(len(y_test)))
Exemplo n.º 22
0
def train(x_dataset, y_dataset, test_size=.33):
    """Fit a linear regression and pair true with predicted test targets.

    Parameters:
        x_dataset: feature data.
        y_dataset: targets; the test part must expose ``.values``.
        test_size: fraction of the data held out for prediction.

    Returns:
        tuple (result, y_test.values, predict) where ``result`` is a list
        of (true, predicted) float pairs.
    """
    parts = tts(x_dataset, y_dataset, test_size=test_size)
    x_train, x_test, y_train, y_test = parts

    regressor = LinearRegression()
    regressor.fit(x_train, y_train)
    predict = regressor.predict(x_test)

    actual = y_test.values
    result = [
        (float(truth), float(guess)) for truth, guess in zip(actual, predict)
    ]

    return result, y_test.values, predict
Exemplo n.º 23
0
def main():
    """Evaluate the ``results`` helper on '../data/alabone.data'.

    Part 1 draws ten random 50/50 splits and collects the value returned
    by ``results`` for k = 1, 2 and 3, then prints mean and variance per
    k. Part 2 repeats the k = 3 evaluation over five stratified folds.

    Indentation is normalised to four spaces (the mixed tabs and spaces
    are rejected by Python 3) and every print is written in a form that
    produces the same output on Python 2 and Python 3.
    """
    df = pd.read_csv('../data/alabone.data', header=0, error_bad_lines=False)

    tar = df['label']

    df = df.drop(['c1', 'label'], axis=1)

    # Q1 split 50-50%
    rk = {1: [], 2: [], 3: []}
    for i in range(0, 10):
        print('Test run %s' % i)
        xtrain, xtest, ytrain, ytest = tts(df, tar, test_size=0.5)
        rk[1].append(results(xtrain, xtest, ytrain, ytest, k=1))
        print('')
        rk[2].append(results(xtrain, xtest, ytrain, ytest, k=2))
        print('')
        rk[3].append(results(xtrain, xtest, ytrain, ytest, k=3))

    print("Mean accuracy and variance over 10 runs with k = 1 %s %s"
          % (np.mean(rk[1]), np.var(rk[1])))
    print('')
    print("Mean accuracy and variance over 10 runs with k = 2 %s %s"
          % (np.mean(rk[2]), np.var(rk[2])))
    print('')
    print("Mean accuracy and variance over 10 runs with k = 3 %s %s"
          % (np.mean(rk[3]), np.var(rk[3])))

    # Cross validation 5 fold
    sf = StratifiedKFold(tar, n_folds=5)
    rk[3] = []
    for fold, (train, test) in enumerate(sf, 1):
        print('Fold %s' % fold)
        xtrain, xtest = df.values[train], df.values[test]
        ytrain, ytest = tar.values[train], tar.values[test]
        print('')
        # NOTE(review): this calls `result`, while the loop above calls
        # `results` - possibly a typo; confirm which helper is intended.
        rk[3].append(result(xtrain, xtest, ytrain, ytest, k=3))

    print('')
    print("Mean accuracy and variance over 5-folds %s %s"
          % (np.mean(rk[3]), np.var(rk[3])))
Exemplo n.º 24
0
def classifier():
    """Tune a LinearSVC's C by grid search and print train/test accuracy.

    Steps: obtain the count matrix from ``jiebaCounter``, divide it by its
    axis-1 maxima (plus epsilon), split it 75/25, fit a LinearSVC on the
    training part, fit a grid search over ten log-spaced C values, and
    print the best parameters followed by both accuracies.

    Returns:
        None.
    """
    vect, voc, txt = jiebaCounter()
    # normalisation
    scale = np.max(vect, axis=1) + 1e-10
    x = np.array(vect / scale)
    x_train, x_test, y_train, y_test = tts(x,
                                           y,
                                           test_size=0.25,
                                           train_size=0.75)
    base_model = svm.LinearSVC()
    base_model.fit(x_train, y_train)
    candidates = np.logspace(-5, 0, 10)
    search = GridSearchCV(estimator=base_model, param_grid=dict(C=candidates))
    # NOTE(review): `x_` and `y` are not defined inside this function;
    # presumably module-level names - confirm against the full file.
    search.fit(x_, y)
    print(search.best_params_)

    print("train accuracy:")
    train_matches = search.predict(x_train) == y_train
    print(np.sum(train_matches) / float(len(y_train)))

    print("test accuracy:")
    test_matches = search.predict(x_test) == y_test
    print(np.sum(test_matches) / float(len(y_test)))
Exemplo n.º 25
0
def main():
    """Evaluate the ``results`` helper on '../data/iris.data'.

    Part 1 draws ten random 50/50 splits and collects the value returned
    by ``results`` for k = 1, 2 and 3, then prints mean and variance per
    k. Part 2 repeats the k = 3 evaluation over five stratified folds.

    Indentation is normalised to four spaces (the mixed tabs and spaces
    are rejected by Python 3) and every print is written in a form that
    produces the same output on Python 2 and Python 3.
    """
    df = pd.read_csv('../data/iris.data',)
    df.columns = ['sepal_l', 'sepal_w', 'petal_l', 'petal_w', 'label']

    tar = df['label']

    df = df.drop(['label'], axis=1)

    # Q1 split 50-50%
    rk = {1: [], 2: [], 3: []}
    for i in range(0, 10):
        print('Test run %s' % i)
        xtrain, xtest, ytrain, ytest = tts(df, tar, test_size=0.5)
        rk[1].append(results(xtrain, xtest, ytrain, ytest, k=1))
        print('')
        rk[2].append(results(xtrain, xtest, ytrain, ytest, k=2))
        print('')
        rk[3].append(results(xtrain, xtest, ytrain, ytest, k=3))

    print("Mean accuracy and variance over 10 runs with k = 1 %s %s"
          % (np.mean(rk[1]), np.var(rk[1])))
    print('')
    print("Mean accuracy and variance over 10 runs with k = 2 %s %s"
          % (np.mean(rk[2]), np.var(rk[2])))
    print('')
    print("Mean accuracy and variance over 10 runs with k = 3 %s %s"
          % (np.mean(rk[3]), np.var(rk[3])))

    # Cross validation 5 fold
    sf = StratifiedKFold(tar, n_folds=5)
    rk[3] = []
    for fold, (train, test) in enumerate(sf, 1):
        print('Fold %s' % fold)
        xtrain, xtest = df.values[train], df.values[test]
        ytrain, ytest = tar.values[train], tar.values[test]
        print('')
        # NOTE(review): this calls `result`, while the loop above calls
        # `results` - possibly a typo; confirm which helper is intended.
        rk[3].append(result(xtrain, xtest, ytrain, ytest, k=3))

    print('')
    print("Mean accuracy and variance over 5-folds %s %s"
          % (np.mean(rk[3]), np.var(rk[3])))
Exemplo n.º 26
0
def build_and_save_model(X, y, filepath):
    """
    Train an SGD text classifier, print a hold-out report and pickle it.

    - The targets are label-encoded.
    - 10% of the data is held out; the model is fitted on the other 90%.
    - A classification report for the held-out part is printed.
    - The fitted model (with the encoder attached as ``labels_``) is
      pickled to ``filepath`` and returned.
    """
    def build(classifier, X, y=None):
        """
        Assemble preprocessor, TF-IDF vectorizer and classifier, then fit.
        """
        estimator = classifier() if isinstance(classifier, type) else classifier
        steps = [
            ('preprocessor', DataPreProcessor()),
            ('vectorizer',
             TfidfVectorizer(tokenizer=identity,
                             preprocessor=None,
                             lowercase=False)),
            ('classifier', estimator),
        ]
        pipeline = Pipeline(steps)
        pipeline.fit(X, y)
        return pipeline

    # Encode the class names as integers.
    labels = LabelEncoder()
    y = labels.fit_transform(y)

    # Hold out 10% for the report.
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.1)
    model = build(SGDClassifier, X_train, y_train)

    predictions = model.predict(X_test)
    report = clsr(y_test, predictions, target_names=labels.classes_)
    print(report)

    model.labels_ = labels

    with open(filepath, 'wb') as f:
        pickle.dump(model, f)

    return model
Exemplo n.º 27
0
def tfidf_iterator(batch_size=100,max_features=10000,path="/home/tingyubi/20w/data/",prefix="extraction-",begin=1,end=26):
    """Load a pickled TF-IDF matrix and wrap a 90/10 split in MXNet iterators.

    The matrix is read from "tfidf_<prefix><begin>_<end>.mat" and the
    vocabulary from the matching ".voc" file (UTF-8, one entry per line).
    Each row of the dense matrix is divided by its maximum (plus a small
    epsilon) before splitting.

    Parameters:
        batch_size: batch size of both iterators.
        max_features: not used by this function.
        path: not used by this function.
        prefix, begin, end: components of the file names to load.

    Returns:
        tuple (train_iter, test_iter, voc).
    """
    stem = "tfidf_" + prefix + str(begin) + "_" + str(end)
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    with open(stem + ".mat", 'rb') as f:
        tf = cPickle.load(f)
    # A with-block closes the vocabulary file even if reading fails.
    with open(stem + ".voc", 'r') as f:
        voc = f.read().decode('utf-8').split("\n")
    tf = tf.toarray()
    tf = tf / (np.max(tf, axis=1)[:, None] + 1e-10)
    x_train, x_test = tts(tf, train_size=0.9, test_size=0.1)
    train_iter = mx.io.NDArrayIter(data=x_train, batch_size=batch_size, shuffle=True)
    test_iter = mx.io.NDArrayIter(data=x_test, batch_size=batch_size, shuffle=True)
    return train_iter, test_iter, voc
def build_model(X, y, classifier, verbose=True):
    """Evaluate ``classifier`` on a hold-out split, then refit on all data.

    Parameters:
        X: input documents.
        y: target labels; label-encoded before use.
        classifier: estimator instance used as the final pipeline step.
        verbose: when true, progress and timing messages are printed. The
            classification report is always printed.

    Returns:
        The pipeline fitted on the complete data, with ``labels_`` set to
        the original label values of ``model.classes_``.
    """
    @timeit
    def build(classifier, X, y=None):
        """Build and fit a single pipeline (wrapped by ``timeit``)."""
        steps = [
            ('preprocessor', NLTKPreprocessor()),
            ('vectorizer',
             TfidfVectorizer(tokenizer=identity,
                             preprocessor=None,
                             lowercase=False)),
            ('classifier', classifier),
        ]
        pipeline = Pipeline(steps)
        pipeline.fit(X, y)
        return pipeline

    # Encode the targets as integers.
    encoder = LabelEncoder()
    y = encoder.fit_transform(y)

    # Hold-out evaluation on 20% of the data.
    if verbose:
        print("Building for evaluation")
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
    model, secs = build(classifier, X_train, y_train)

    if verbose:
        print("Evaluation model fit in {:0.3f} seconds".format(secs))
        print("Classification Report:\n")

    predictions = model.predict(X_test)
    print(clsr(y_test, predictions, target_names=encoder.classes_))

    # Final model on the complete data.
    if verbose:
        print("Building complete model and saving ...")
    model, secs = build(classifier, X, y)
    model.labels_ = encoder.inverse_transform(model.classes_)

    if verbose:
        print("Complete model fit in {:0.3f} seconds".format(secs))

    return model
def pca_svm(pca_n=10,svm_C=1):
    """Train a PCA + SVC classifier and report hold-out metrics.

    The data from ``get_data`` is split with ``tts`` (random_state=33).
    PCA is fitted on the training part only and applied to the test part;
    an SVC is trained on the reduced training data.

    Parameters:
        pca_n: number of PCA components.
        svm_C: C parameter of the SVC.

    Returns:
        tuple (accuracy, precision, recall, f1, seconds) where precision
        and recall are weighted averages and ``seconds`` is the elapsed
        wall-clock time.
    """
    t1 = time.time()
    data, target = get_data()
    x_train, x_test, y_train, y_test = tts(data, target, random_state=33)
    pca_learner = decomposition.PCA(n_components=pca_n)
    x_train = pca_learner.fit_transform(x_train)
    svm_learner = svm.SVC(C=svm_C)
    svm_learner.fit(x_train, y_train)
    x_test_pre = pca_learner.transform(x_test)
    y_test_pre = svm_learner.predict(x_test_pre)
    ac = svm_learner.score(x_test_pre, y_test)
    p = precision_score(y_test, y_test_pre, average='weighted')
    r = recall_score(y_test, y_test_pre, average='weighted')
    # Harmonic mean of precision and recall. It is 0 when either is 0;
    # guarding here avoids a ZeroDivisionError for plain-float scores.
    f1 = 2.0 / (1.0 / p + 1.0 / r) if p and r else 0.0
    t = time.time() - t1
    return ac, p, r, f1, t
    def Run(self, csv):
        """Fit a logistic regression on CSV text and predict a hold-out set.

        The first two columns are used as features and the third as the
        target. 20% of the rows are held out (random_state=0). Features
        are standardised with statistics learned from the training part.

        Parameters:
            csv: CSV content as a string (comma separated).

        Returns:
            list of dicts with the keys 'Age', 'Salary' (unscaled test
            features), 'Expected' (true target) and 'Preditect'
            (predicted target).
        """
        # sklearn.cross_validation was removed from scikit-learn; prefer
        # model_selection and fall back only on old installations.
        try:
            from sklearn.model_selection import train_test_split as tts
        except ImportError:
            from sklearn.cross_validation import train_test_split as tts
        from sklearn.preprocessing import StandardScaler
        from sklearn.linear_model import LogisticRegression

        dataset = pd.read_csv(StringIO(csv))

        x = dataset.iloc[:, [0, 1]].values
        y = dataset.iloc[:, 2].values

        x_train, x_test, y_train, y_test = tts(x,
                                               y,
                                               test_size=0.2,
                                               random_state=0)

        sc_x = StandardScaler()

        x_train_sc = sc_x.fit_transform(x_train)
        # Reuse the training mean/variance; re-fitting on the test set
        # would scale the two parts inconsistently.
        x_test_sc = sc_x.transform(x_test)

        # FOR LINEAR LOGISTIC REGRESSION => ONLY TWO OUTPUTS (SIGMOID)
        llr = LogisticRegression(random_state=0)

        llr.fit(x_train_sc, y_train)

        y_pred = llr.predict(x_test_sc)

        rows = zip(x_test.tolist(), y_test.tolist(), y_pred.tolist())
        result = [
            {
                'Age': features[0],
                'Salary': features[1],
                'Expected': expected,
                'Preditect': predicted,
            }
            for features, expected, predicted in rows
        ]

        print(result)

        return result
def pca_svm_pipeline():
    """Grid-search a PCA + SVC pipeline and plot every image by prediction.

    The data from ``get_data`` is split with ``tts`` (random_state=33) and
    the number of PCA components is grid-searched on the training part.
    The best parameters and a classification report for the test part are
    printed. All samples are then predicted and drawn as 64x64 grey-scale
    images, placed by predicted label; misclassified images are marked
    with a red cross.

    Returns:
        None. The figure is shown with ``pyplot.show()``.
    """
    svm_C = [1]
    pca_n_components = numpy.arange(5, 200, 10)
    data, target = get_data()
    x_train, x_test, y_train, y_test = tts(data, target, random_state=33)
    pca_learner = decomposition.PCA()
    svm_learner = svm.SVC()
    pipe = pipeline.Pipeline([('pca', pca_learner), ('svm', svm_learner)])
    gscv = GridSearchCV(pipe,
                        {'pca__n_components': pca_n_components, 'svm__C': svm_C},
                        n_jobs=-1)
    gscv.fit(x_train, y_train)
    y_test_pre = gscv.predict(x_test)
    report = classification_report(y_test, y_test_pre)
    print(gscv.best_params_)
    print(report)

    target_pre = gscv.predict(data)
    n1, _ = data.shape
    pyplot.figure()
    # Per-label counter of images already drawn, used as the vertical slot.
    # 40 entries - presumably one per class; TODO confirm.
    L = numpy.zeros((40,))
    xx = numpy.linspace(0, 1, 64) + 13
    yy = numpy.linspace(1, 0, 64) + 13
    xx, yy = numpy.meshgrid(xx, yy)
    for i in range(n1):
        k = target_pre[i]
        g = L[k]
        L[k] += 1
        # Shift the unit cell by the predicted label and by the slot.
        xx1 = xx - k
        yy1 = yy - g
        pyplot.contourf(xx1, yy1, data[i].reshape((64, 64)), cmap='gray')
        if target[i] != target_pre[i]:
            pyplot.scatter(numpy.mean(xx1), numpy.mean(yy1), marker='x', c='red', s=40)
    pyplot.axis('off')
    # Pass a real boolean: the string 'off' is truthy and is not
    # reliably interpreted as "disable the grid".
    pyplot.grid(False)
    pyplot.title('PCA & SVM Recongnize Faces')
    pyplot.show()
Exemplo n.º 32
0
def build_and_evaluate(text, leanings, classifier=SGDClassifier, verbose=True):
    """Evaluate a text classifier on a hold-out split, then refit on all data.

    Parameters:
        text: input documents.
        leanings: target labels; label-encoded before use.
        classifier: estimator class or instance for the final step.
        verbose: accepted but not used by this function.

    Returns:
        tuple (leanings_test, leanings_pred_prob, model): the encoded test
        targets, the predicted probabilities for the test documents (from
        the hold-out model) and the model refitted on all data with the
        encoder attached as ``labels_``.
    """
    def build(classifier, X, y=None):
        """Create the preprocessing/vectorizing/classifying pipeline and fit it."""
        estimator = classifier() if isinstance(classifier, type) else classifier
        steps = [
            ('preprocessor', NLTKPreprocessor()),
            ('vectorizer',
             TfidfVectorizer(tokenizer=identity,
                             preprocessor=None,
                             lowercase=False)),
            ('classifier', estimator),
        ]
        pipeline = Pipeline(steps)
        pipeline.fit(X, y)
        return pipeline

    # Encode the targets as integers.
    labels = LabelEncoder()
    leanings = labels.fit_transform(leanings)

    # Hold-out model on 80% of the data.
    parts = tts(text, leanings, test_size=0.2)
    text_train, text_test, leanings_train, leanings_test = parts
    model = build(classifier, text_train, leanings_train)

    leanings_pred = model.predict(text_test)
    # NOTE(review): predict_proba must be supported by the chosen
    # classifier - confirm this holds for the default SGDClassifier().
    leanings_pred_prob = model.predict_proba(text_test)
    print(clsr(leanings_test, leanings_pred, target_names=labels.classes_))

    # Final model on all data.
    model = build(classifier, text, leanings)
    model.labels_ = labels

    return leanings_test, leanings_pred_prob, model
Exemplo n.º 33
0
    for vid,Xt,yt in zip(subjId_val, X_val, y_val):
	levelOneTest = []
	levelOneTrain = []
	X_levelOne = []
	y_levelOne = []	
	level0Classifier = []
        for tid,Xp,yp in zip(subjId_train,X_train,y_train):
	    print "Predicting subject ", vid, "from subject ", tid
            y0 = np.zeros(yp.shape)
	    y1 = np.ones(Xt.shape[0])
	    X = np.vstack([Xp,Xt])
            yd = np.concatenate([y0,y1])

            pls = PLSRegression(n_components)
	    Xp_t, Xp_v, yp_t, yp_v = tts(Xp.copy(),yp.copy(),train_size=0.9)
	    yp_t = yp_t.astype(bool)
	    yp_t_not =  np.vstack((yp_t,~yp_t)).T
	    #print "yp_t_not ", yp_t_not.shape
	    pls.fit(Xp_t,yp_t_not.astype(int))
	    yp_new = pls.predict(Xp_t, copy=True)
	    yp_pred = (yp_new[:,0] > yp_new[:,1]).astype(int)
	    yp_t = yp_t.astype(int)
	    #print y_new,y_pred, y_t
	    error = ((yp_t - yp_pred) ** 2).sum()
   	    print "PLS Training error " , float(error)/yp_t.shape[0]
 	    yp_new = pls.predict(Xp_v, copy=True)
	    yp_pred = (yp_new[:,0] > yp_new[:,1]).astype(int)
	    #print y_new, y_pred, y_v
	    #print ((y_v - y_pred) ** 2).sum(), y_v.shape[0]
	    error = ((yp_v - yp_pred) ** 2).sum()
Exemplo n.º 34
0
# Exploratory logistic-regression script: `df`, `y`, `lr`, `tts`,
# `cross_val_score` and `skcr` are defined outside this excerpt.
# Bare expressions below only display a value in an interactive session.

# Features: every column except the 'Survived' target.
x=df.drop('Survived',axis=1)

# Baseline accuracy: share of samples whose target equals 0.
float(len(y[y==0]))/float(len(y))

# Basic model fitted and scored on the same data (no validation).
model=lr()
model.fit(x,y)
model.score(x,y)

# Correlation coefficient of each feature column with the target.
for column in x.columns:
    print column, np.corrcoef(x[column],y)[1][0]

# Build train/test sets: 80% training data, fixed seed.
x_train, x_test, y_train, y_test = tts(x,y, train_size=.8, random_state=1)

# Refit on the training part and validate on the test part.
model.fit(x_train,y_train)
model.score(x_test,y_test)
proba=model.predict_proba(x_test)
pred=model.predict(x_test)
# 12-fold cross-validation, run on the test part only.
s= cross_val_score(model,x_test,y_test, cv=12)
s.mean()
s.std()

# The f-1 scores show that our model does a fairly decent job of predicting
# those who died and an okay job predicting those who survived.
print skcr(y_test,pred)

# (true negative) (false positive)
Exemplo n.º 35
0
Arquivo: script.py Projeto: shawk3/KNN
# Interactive settings, asked in this order: number of repetitions, hold-out
# fraction, learning rate.
count = int(input('How many times would you like to run the test: '))
ts = float(input('What test size percentage(eg. 0.25): '))
r = float(input('What learning rate? '))


if dsetindex == 2:
    iris = datasets.load_iris()
    # Normalise the four feature columns one at a time, in place.
    for feature_col in range(4):
        iris.data[:, feature_col] = do.normalize(iris.data[:, feature_col])

    for i in range(count):
        # The first split holds out the test set; the second carves a
        # validation set out of the remaining training data, using the same
        # fraction both times.
        xtrain, xtest, ytrain, ytest = tts(
            iris.data, iris.target, test_size=ts)
        xtrain, xvalidate, ytrain, yvalidate = tts(
            xtrain, ytrain, test_size=ts)

        # A fresh network is built for every repetition.
        nn = NN.NeuralNetwork(3, 4, r)
        nn.addNewLayer(3)
        scores = nn.train(xtrain, ytrain, xvalidate, yvalidate)
        print('Test: ', nn.test(xtest, ytest))

if dsetindex == 1:
    data = np.array(do.read_file("indianDiabetes.txt")).astype(np.float16)
    data[: , 0] = do.normalize(data[:,0])
    data[: , 1] = do.normalize(data[:,1])
    data[: , 2] = do.normalize(data[:,2])
    data[: , 3] = do.normalize(data[:,3])
    data[: , 4] = do.normalize(data[:,4])
    data[: , 5] = do.normalize(data[:,5])
    data[: , 6] = do.normalize(data[:,6])
Exemplo n.º 36
0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler
# train_test_split is taken from sklearn.model_selection, the module this
# script already uses for cross_val_score below. The former
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import confusion_matrix

dataset = pd.read_csv("Social_Network_Ads.csv")
# According to the correlation matrix, gender carries no useful signal anyway.
corr = dataset.corr()

# Features are the columns at positions 2 and 3; the target is the last column.
X = dataset.iloc[:, 2:4]
y = dataset.iloc[:, -1]

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = tts(X, y, test_size=.2, random_state=0)

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

from sklearn.svm import SVC
# random_state is fixed so that the results do not change from run to run.
classifier = SVC(kernel="rbf",
                 random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)

# applying k-fold cross validation
from sklearn.model_selection import cross_val_score
Exemplo n.º 37
0
from sklearn.datasets import fetch_mldata
from sklearn import svm
from sklearn.cross_validation import train_test_split as tts

mnist = fetch_mldata('MNIST original')
print("Data fetched.")

# An integer test_size holds out that many samples (here 10000) for testing.
Xtr, Xts, Ytr, Yts = tts(mnist.data,
                         mnist.target,
                         test_size=10000)
print("tts done.")
clf = svm.SVC(gamma=0.001, C=100.)
clf.fit(Xtr, Ytr)
print("fitted.")
# predict() expects a 2-D (n_samples, n_features) array. Slicing with [-1:]
# keeps the last test sample as a single-row 2-D array, whereas Xts[-1] would
# hand over a 1-D vector. The result is an array holding one label.
predicted_label = clf.predict(Xts[-1:])
## Module Constants
##########################################################################
##########################################################################
## Modules
##########################################################################
##########################################################################
## Program
##########################################################################

if __name__ == "__main__":

    # Load the labelled text corpus from the "Language_Folder" directory.
    corpus = load_files("Language_Folder")

    # print len(corpus.data)

    # Hold out 20% of the documents for evaluation.
    X_train, X_test, y_train, y_test = tts(corpus.data, corpus.target, test_size=0.20)

    # Character n-gram counts (analyzer="char_wb") feeding a multinomial
    # naive Bayes classifier.
    text_clf = Pipeline([("vec", CountVectorizer(analyzer="char_wb")), ("clf", MultinomialNB())])

    text_clf = text_clf.fit(X_train, y_train)

    # Store the fitted pipeline using pickle. Pickle output is binary data, so
    # the file has to be opened in binary mode ("wb"); text mode can corrupt
    # the stream or be rejected outright.
    with open("experiment_file", "wb") as f:
        pickle.dump(text_clf, f)

    # Accuracy is the fraction of held-out documents predicted correctly.
    predicted = text_clf.predict(X_test)
    accuracy = np.mean(predicted == y_test)
    print(accuracy)

    print("Here it is.")
# Show the module docstring when the example is run.
print(__doc__)

# Generate the dataset: a two-class problem with a 30% / 70% class weighting,
# five features and a fixed seed so the data is reproducible.
X, y = make_classification(n_classes=2, class_sep=1.25, weights=[0.3, 0.7],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=5, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)

# Create the samplers
enn = EditedNearestNeighbours()
renn = RepeatedEditedNearestNeighbours()

# Create the classifier
# NOTE(review): KNN is an alias imported outside this excerpt - presumably
# KNeighborsClassifier, which would make the argument the neighbour count; confirm.
knn = KNN(1)


# Make the splits (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = tts(X, y, random_state=42)

# Chain one transformer (PCA), two samplers and the classifier in a single
# pipeline object
pipeline = make_pipeline(pca, enn, renn, knn)

# Fit on the training split and predict the held-out split.
pipeline.fit(X_train, y_train)
y_hat = pipeline.predict(X_test)

print(classification_report(y_test, y_hat))
Exemplo n.º 40
0
# The source file is too big to process in one go, so it is cut into ten
# smaller CSV files that can be run chunk by chunk.

import pandas as pd
from sklearn.cross_validation import train_test_split as tts

df = pd.read_csv('sdwis_clean.csv')
df = df.drop(['Unnamed: 0'], axis=1)

# Every split keeps `keep_fraction` of what is still in df and peels the rest
# off as one chunk, so df shrinks step by step.
chunks = []
for keep_fraction in (.9, .89, .88, .86, .83, .8, .75, .66):
    df, piece = tts(df, train_size=keep_fraction)
    chunks.append(piece)

# The remainder is halved: the test half becomes chunk 9, the train half
# chunk 10.
df10, df9 = tts(df, train_size=.5)
chunks += [df9, df10]

# Write the chunks out as df1.csv ... df10.csv, in that order.
for number, piece in enumerate(chunks, 1):
    piece.to_csv("df%d.csv" % number)
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

# Pickle data is binary, so the file is opened in "rb" mode, and the with-block
# closes the handle as soon as the data has been loaded.
with open("../final_project/final_project_dataset.pkl", "rb") as dataset_file:
    data_dict = pickle.load(dataset_file)

### add more features to features_list!
features_list = ["poi", "salary"]

data = featureFormat(data_dict, features_list)
labels, features = targetFeatureSplit(data)

### your code goes here 

from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.cross_validation import train_test_split as tts

# Hold out 30% of the samples for testing; the seed keeps the split reproducible.
features_train, features_test, labels_train, labels_test = tts(
    features, labels, test_size=0.3, random_state=42)

# Every report line is printed as one pre-formatted string. A multi-argument
# print(...) call prints a tuple under Python 2, so this form gives the same
# "label value" output under Python 2 and Python 3.
# Baseline: the accuracy obtained by always predicting label 0.
print('Baseline accuracy: %s'
      % (list(labels_test).count(0) / float(len(labels_test))))
clf = DTC()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

print("Predicted number of person's of interest %s" % list(pred).count(1))
print('Accuracy: %s' % accuracy_score(labels_test, pred))
print('Precision: %s' % precision_score(labels_test, pred))
print('Recall: %s' % recall_score(labels_test, pred))


Exemplo n.º 42
0
#
# x=sales15['margin_sum_15q1']
# y=sales15['sale_total_15']
# plt.scatter(x, y)
# plt.xlabel("Total Margin 2015 Q1")
# plt.ylabel("Total Sales 2015")
# plt.show()

# Predictors: three Q1 measures that were all highly correlated with sales for
# the full year. They are also correlated with one another, so using all of
# them is redundant; they are kept together here purely to practise a
# multivariable linear regression.
x = ['sale_total_15q1', 'vol_sol_l_sum_15q1', 'margin_sum_15q1']

# Split the data with train_size=.85, then separate predictors from the target.
train, test = tts(sales15, train_size=.85)
train_x, train_y = train[x], train['sale_total_15']
test_x, test_y = test[x], test['sale_total_15']

# Ordinary least squares fitted on the training rows and scored on the
# held-out rows.
lm = linear_model.LinearRegression()
model = lm.fit(train_x, train_y)
predictions = lm.predict(test_x)
print("Sample: %s" % lm.score(test_x, test_y))

# The same procedure with a ridge-regularised model (RidgeCV).
lm = linear_model.RidgeCV()
model = lm.fit(train_x, train_y)
predictions = lm.predict(test_x)
Exemplo n.º 43
0
	
# Regression on two predictors ('all_prev' and 'all_avg') against 'all_yr',
# fitted and scored on the same rows, so the printed score is a training score.
# NOTE(review): LR is an alias imported outside this excerpt - presumably
# LinearRegression; confirm.
linComb = LR()
linComb.fit(df_avg[['all_prev', 'all_avg']].values, df_avg['all_yr'].values)
print linComb.score(df_avg[['all_prev', 'all_avg']].values, df_avg['all_yr'].values)

# NOTE(review): X and y used here are whatever was bound before this excerpt;
# they are only reassigned further down - confirm which feature set is meant.
linDet = LR()
linDet.fit(X,y)
print linDet.score(X,y)

# df_avg == 3 year rolling average + yr4 stats
# The next three assignments are alternative feature sets; each one is
# overwritten by the following line, so only the last X,y binding below takes
# effect when the script is run top to bottom.
X,y = df_avg[['all_avg']].values, df_avg['all_yr'].values
X,y = df_avg[['all_prev']].values, df_avg['all_yr'].values
X,y = df_avg[['all_avg', 'all_prev']].values, df_avg['all_yr'].values

# Effective feature set: the four *_avg columns plus the four *_prev columns.
X,y = df_avg[['1D_avg', '2D_avg', '3D_avg', 'all_avg','1D_prev', '2D_prev', '3D_prev', 'all_prev']].values, df_avg['all_yr'].values
X_train, X_test, y_train, y_test = tts(X, y)

# Model without an intercept, fitted and scored on the full data (not on the
# train split); the score is not stored or printed.
lin = LR(fit_intercept=False)
lin.fit(X,y)
lin.score(X,y)

# Model with n_neighbors=5: print the training score, then the test score.
# NOTE(review): KNR is an alias imported outside this excerpt - presumably
# KNeighborsRegressor; confirm.
knn = KNR(n_neighbors=5)
knn.fit(X_train,y_train)
print knn.score(X_train,y_train)
print knn.score(X_test,y_test)


ns = range(1,30,2)
scores = []
for n in ns:
	knn = KNR(n_neighbors=n)
        X_test,y_test = X[test_idx,:],y[test_idx]
        plt.scatter(X_test[:,0],X_test[:,1],c='',
                    alpha=1.0,linewidth=1,marker='o',
                    s=55,label='test set')

    

if __name__=='__main__':

    # Only the iris feature columns at indices 2 and 3 are used.
    iris = datasets.load_iris()
    X, y = iris.data[:, [2, 3]], iris.target

    # Hold out 30% of the samples for testing and train on the other 70%;
    # the fixed seed keeps the split reproducible.
    X_train, X_test, y_train, y_test = tts(
        X, y, test_size=0.3, random_state=0)

    # Feature scaling: the scaler is fitted on the training data only, and
    # those same fitted parameters are then used to standardise both the
    # training set and the test set.
    sc = ss()
    sc.fit(X_train)
    X_train_std, X_test_std = sc.transform(X_train), sc.transform(X_test)


    #n_iter:-  Number of Epochs(passes over the TrDS set)
    #eta0/eta:-learning rate
Exemplo n.º 45
0
import numpy as np
# pandas is required for pd.read_csv below but was not imported in this script.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.decomposition import PCA as PCA
from sklearn.cross_validation import train_test_split as tts



datapath = 'G:/Continuing Education/Research & Presentations/Self - Machine Learning/Kaggle/OttoProductClassification/Data/'
trainfile = 'train.csv'
testfile = 'test.csv'

# Load the training file and work on the underlying array.
trd = pd.read_csv(datapath+trainfile)
trd = trd.values

# Split data into training and cross-validation dataset (33% held out)
nptrd, npcvd = tts(trd,test_size=0.33)


# Train the model

# Columns 1..93 are the ones fed to PCA. The selection is done once and reused
# for fit and transform instead of being repeated for each call.
feature_cols = range(1, 94)
train_features = nptrd[:, feature_cols]

pca = PCA(n_components=40)
pca.fit(train_features)
X = pca.transform(train_features)
# Total share of variance captured by the 40 retained components.
PCAExplained = sum(pca.explained_variance_ratio_)

# Most of the features are highly skewed i.e. their 75% value ranges when we do td.describe() is 0 while their max is much higher. 
# This indicates that only a few values are non-zero for most features.
# This could mean that these features are actually categorical variables that are encoded in the test data.. could .. not sure

forest = rfc(n_estimators=500,criterion = 'entropy' , n_jobs=-1,min_samples_split=5,min_samples_leaf=5,max_depth=20)
#forest = forest.fit(nptrd[:,range(1,94)],nptrd[:,-1])
Exemplo n.º 46
0
from sklearn import metrics
from imblearn.over_sampling import ADASYN
from imblearn.ensemble import BalanceCascade
from imblearn.over_sampling import RandomOverSampler
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 


# Oversample the training matrix, then pass the resampled output through the
# same sampler a second time.
# NOTE(review): SMOTE, Counter, svm, tts, X_train_tf and datatrain are defined
# outside this excerpt.
# NOTE(review): the reason for the second fit_sample pass is not visible here -
# confirm that it is intended.
sm = SMOTE()
X_res, y_res = sm.fit_sample(X_train_tf.toarray(), datatrain['sentiment'])
X_res, y_res=sm.fit_sample(X_res,y_res)
# Report the label counts before and after resampling.
print ('Data sentmen asli {}'.format (Counter(datatrain['sentiment'])))
print('Resampled dataset shape {}'.format(Counter(y_res)))

clf=svm.LinearSVC()
#clf = svm.SVC(decision_function_shape='ovo')
# NOTE(review): the train/test split happens after resampling, so resampled
# rows can end up in the test split - confirm that this is acceptable.
X_train, X_test, y_train, y_test = tts(X_res, y_res,test_size=0.2)
clf.fit(X_train,y_train)
predicted=clf.predict(X_test)
print(metrics.classification_report(y_test, predicted))
# Macro-averaged precision, recall and F1, plus plain accuracy, on the
# held-out 20%.
presisi_svm_smote=metrics.precision_score(y_test, predicted,average='macro')
recall_svm_smote=metrics.recall_score(y_test, predicted,average='macro')
f1_svm_smote=metrics.f1_score(y_test, predicted,average='macro')
akurasi_svm_smote=metrics.accuracy_score(y_test, predicted)
print "Presisi:",presisi_svm_smote 
print "Recall:", recall_svm_smote
print "F1-Score:", f1_svm_smote
print "Akurasi:", akurasi_svm_smote

Exemplo n.º 47
0
			else:
				df_tmp = pd.merge(df_tmp,df[(df.Year == y2) & (df.Team.isin(tms_include))][['Team','f']],how='left', on=['Team'])
				df_tmp.rename(columns={'f':'f_yr-%d' % (n)}, inplace=True)
	if df_tmp is not None:
		df_lag = df_lag.append(df_tmp[df_tmp.columns])
	
	
# Calculate changes
# df_lag['change'] = df_lag.yr2 - df_lag.yr1
# df_lag['abs_change'] = abs(df_lag.yr2 - df_lag.yr1)
# for c in df_lag.columns:
	# df_lag[c] = df_lag[c].astype(float)

# Predictor columns and the single target column taken from df_lag.
Xcol = ['yr1_f','off_f','def_f','st_f','s_p','fei']
ycol = ['yr2_f']
X_train, X_test, y_train, y_test = tts(df_lag[Xcol].values, df_lag[ycol].values)

# Fit on the training split. The score below is computed on that same training
# split (X_test / y_test are not used in this excerpt) and is not stored.
# NOTE(review): LR is an alias imported outside this excerpt - presumably
# LinearRegression; confirm.
linreg = LR()
linreg.fit(X_train, y_train)
linreg.score(X_train, y_train)



# Train on all existing seasons to project 2014
# (linreg is rebound here, replacing the model fitted on the split above)
X,y = df_lag[Xcol].values, df_lag[ycol].values
linreg = LR()
linreg.fit(X, y)
linreg.score(X, y)

# build 3yr avgs
# Empty frame with the column layout for the three-year averages.
df_3avg = pd.DataFrame(columns=['avg_f']+['off_f','def_f','st_f','s_p','fei','yr4_f'])
Exemplo n.º 48
0
# Median age of the rows with Sex == 0.
# NOTE(review): f_Age is not used below - missing ages are filled with the
# constant 27.5 instead; confirm which value is intended.
f_Age = merged[merged.Sex==0]['Age'].median()
merged['age_fill'] = merged['Age']
merged.loc[merged.Age.isnull(),'age_fill'] = 27.5

# Fill remaining NaNs with the column mean, then scale each column on its own.
cols_to_scale = ['Fare','Pclass','Sex','age_fill','embarked_num','Parch','SibSp']
merged[cols_to_scale] = merged[cols_to_scale].fillna(merged[cols_to_scale].mean())

for col in cols_to_scale:
    merged[[col]] = pp.scale(merged[[col]])

# Split the combined frame back apart: the first len(train) rows are the
# training rows, the rest are the test rows.
train = merged[:len(train)]
test = merged[len(train):]

# Modeling with logistic regression, validated on a held-out split.
xtrain,xval, ytrain,yval= tts(np.array(train[cols_to_scale]), np.ravel(train['Survived']))
LR = lm.LogisticRegression()
model = LR.fit(xtrain, ytrain)
score = model.score(xval,yval)
print('validation score: ',score)

# Predict the test rows and pair each prediction with its PassengerId.
xtest = np.array(test[cols_to_scale])
results = pd.DataFrame([test['PassengerId'], model.predict(xtest)], index = None).transpose()
results =results.rename(columns = {'Unnamed 0' : 'Survived'})

# The with-block closes the file on exit, so no explicit close() is needed.
with open('./Submission.csv','w') as wfile:
    results.to_csv(wfile, index = False)


                             
import os
import numpy as np
import load_data

from sklearn import cross_validation as cv
from sklearn.cross_validation import train_test_split as tts
from sklearn.ensemble import ExtraTreesClassifier

# Walk the 'data' tree and print a feature ranking for every CSV file found.
for root, dirs, files in os.walk('data'):
    for name in files:
        # Anything that is not a CSV file is ignored.
        if not name.endswith('.csv'):
            continue

        print("Loading " + root + "/" + name)
        dataset = load_data.load_data(name, root)

        # Hold out 20% of the samples; only the training part is used below.
        splits = tts(dataset.data, dataset.target, test_size=0.2)
        X_train, X_test, y_train, y_test = splits

        # Build a forest and compute the feature importances, plus their
        # spread across the individual trees.
        forest = ExtraTreesClassifier(n_estimators=250)
        forest.fit(X_train, y_train)
        importances = forest.feature_importances_
        per_tree = [tree.feature_importances_ for tree in forest.estimators_]
        std = np.std(per_tree, axis=0)
        # Feature indices ordered from most to least important.
        indices = np.argsort(importances)[::-1]

        # Print the feature ranking
        print("Feature ranking:")

        for f in range(X_train.shape[1]):
            feature = indices[f]
            print("%d. feature %d (%f)" % (f + 1, feature, importances[feature]))
Exemplo n.º 50
0
# Label-encode feature columns 0..5 in place.
# NOTE: the same encoder object is refitted for every column, so after the
# loop `le` only holds the mapping of the last column (index 5).
le = LabelEncoder()
for i in range(0, 6):
    features[:, i] = le.fit_transform(features[:, i])
# Column 11 gets its own encoder before it is one-hot encoded below.
la = LabelEncoder()
features[:, 11] = la.fit_transform(features[:, 11])
#onehotencoder
'''before that we need to perform labelencoding in the columns of the dataframe'''

# One-hot encode column 11 and convert the result to a dense array.
ohe = OneHotEncoder(categorical_features=[11])
features = ohe.fit_transform(features).toarray()

# Label encoding of the target (column 0 of labels).
lc = LabelEncoder()
labels[:, 0] = lc.fit_transform(labels[:, 0])
from sklearn.cross_validation import train_test_split as tts

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
f_train, f_test, l_train, l_test = tts(features,
                                       labels,
                                       random_state=0,
                                       test_size=.20)
'''*****************************Now With Pandas***************************************'''
# The same preparation done with pandas: every object-typed column is replaced
# by its integer category codes.
feature = df.drop("Target", axis=1)
for i in feature.select_dtypes(include=[object]):
    feature[i] = feature[i].astype('category').cat.codes

# Expand the "Property_Area" column into dummy (indicator) columns.
feature = pd.get_dummies(feature, columns=["Property_Area"])

# Encode the target as category codes, then expand it into dummy columns too.
label = df["Target"]
label = label.astype('category').cat.codes
label = pd.get_dummies(label)
Exemplo n.º 51
0
	
#Initialize the list to keep the scores from each iteration.
	OLS_score = []
	Ridge_score = []
	RidgeCV_score = []
	DecTree1_score = []
	DecTree2_score = []
	Lasso_score = []
	LassoCV_score = []
	RandomForest_score = []
	
		
# Obtain results for running the model a specified number of times
	for i in range(1,15):
#Train the data
		splits = tts(data, target, test_size=0.20)
		X_train, X_test, y_train, y_test = splits
	
#Run the OLS model.
		regr = linear_model.LinearRegression()
		regr.fit(X_train, y_train)
		OLS_score.append(regr.score(X_test, y_test))
		#print 'Coefficients OLS: \n', regr.coef_
		#print 'Intercept OLS: \n', regr.intercept_
		
#Run the Ridge model.
		clf = linear_model.Ridge(alpha=0.5)
		clf.fit(X_train, y_train)
		Ridge_score.append(clf.score(X_test, y_test))
		
#Run the RidgeCV model.
Exemplo n.º 52
0
# Shift every independent variable up by one (the stated goal is to make the
# values positive): build an all-ones integer matrix of the same shape as X
# and add it element-wise.
x1 = X
x2 = np.ones(X.shape, dtype=int)
X = np.add(x1, x2).astype(int)

# Feature selection on the basis of the chi-squared test: keep the 42
# best-scoring features. k is passed by keyword, which every scikit-learn
# version accepts; newer releases reject it as a positional argument.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
select = SelectKBest(chi2, k=42)
sel = select.fit(X, y)
feature_score = sel.scores_  #visualization of features' scores on the basis of chi2
X = sel.transform(X)

# Hold-out split for validation: 2000 samples go to the test set.
# train_test_split comes from sklearn.model_selection: this script already
# needs scikit-learn >= 0.18 for MLPClassifier (imported below), and the old
# sklearn.cross_validation module was removed in 0.20.
from sklearn.model_selection import train_test_split as tts
X_train, X_test, y_train, y_test = tts(X, y, test_size=2000, random_state=1)

# Feature scaling: fit on the training split only, then apply to both splits.
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
scale.fit(X_train)
X_train = scale.transform(X_train)
X_test = scale.transform(X_test)

# For training on the whole dataset, a second scaler is fitted on all of X.
scale2 = StandardScaler()
scale2.fit(X)
X = scale2.transform(X)

#testing score of multi layered perceptron
from sklearn.neural_network import MLPClassifier as mlp
Exemplo n.º 53
0
#   -testing accuracy is a better estimate than training accuracy of out-of-sample performance
#   -but, it provides a high variance estimate since changing which observations happen to be in the testing
#    set can significantly change testing accuracy

from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split as tts
from sklearn.neighbors import KNeighborsClassifier as knn_
from sklearn import metrics 

# Load the iris data: feature matrix and class labels.
iris = load_iris()
X, y = iris.data, iris.target

# Single train/test split; the fixed seed makes the result repeatable.
X_train, X_test, y_train, y_test = tts(X, y, random_state=4)

# Fit KNN with K=5 on the training split and report the accuracy on the
# held-out split.
knn = knn_(n_neighbors=5).fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print(accuracy)

# What if we created a bunch of train/test splits, calculated the accuracy for each,
# then averaged the results together?
# That's the essence of cross-validation

# Steps for K-fold cross-validation
# 1)  Split the data into K equal partitions (or "folds")
# 2)  Use fold 1 as the testing set and the union of the other folds as the training set
# 3)  Calculate testing accuracy