Example #1
def test_unigram_all():
    pos_origin_file = '../data/rule_word_origin_pos.txt'
    neg_origin_file = '../data/rule_word_origin_neg.txt'
    pos_stem_file = '../data/rule_word_stem_pos.txt'
    neg_stem_file = '../data/rule_word_stem_neg.txt'
    svm_origin_score = test_unigram(pos_origin_file, neg_origin_file, NuSVC())
    svm_stem_score = test_unigram(pos_stem_file, neg_stem_file, NuSVC())
    bnb_origin_score = test_unigram(pos_origin_file, neg_origin_file,
                                    BernoulliNB())
    bnb_stem_score = test_unigram(pos_stem_file, neg_stem_file, BernoulliNB())
    rfc_origin_score = test_unigram(pos_origin_file, neg_origin_file,
                                    RandomForestClassifier())
    rfc_stem_score = test_unigram(pos_stem_file, neg_stem_file,
                                  RandomForestClassifier())
    if not os.path.exists('../result'):
        os.mkdir('../result')
    with open('../result/unigram_result.txt', 'wt', encoding='utf-8') as f:
        f.write('original word result:\n')
        f.write('\t\t SVM: {0:.3f}%\n'.format(svm_origin_score * 100))
        f.write('\t\t BNB: {0:.3f}%\n'.format(bnb_origin_score * 100))
        f.write('\t\t RFC: {0:.3f}%\n'.format(rfc_origin_score * 100))
        f.write('\n stem result:\n')
        f.write('\t\t SVM: {0:.3f}%\n'.format(svm_stem_score * 100))
        f.write('\t\t BNB: {0:.3f}%\n'.format(bnb_stem_score * 100))
        f.write('\t\t RFC: {0:.3f}%\n'.format(rfc_stem_score * 100))
Example #2
    def data_feature_importance(self,
                                features_list,
                                title="Feature Importance"):

        # Imputer and sklearn.cross_validation were removed from scikit-learn;
        # use sklearn.impute and sklearn.model_selection instead
        from sklearn.preprocessing import LabelEncoder
        from sklearn.impute import SimpleImputer
        from sklearn.model_selection import train_test_split

        # extract the feature columns
        clf_data = self.dataframe.loc[:, features_list]

        # Preprocess the data: label-encode the categorical features
        cat_feats_to_use = list(clf_data.select_dtypes(include=object).columns)
        for feat in cat_feats_to_use:
            encoder = LabelEncoder()
            clf_data[feat] = encoder.fit_transform(clf_data[feat])

        # Fill in the missing values
        num_feats_to_use = list(clf_data.select_dtypes(exclude=object).columns)
        for feat in num_feats_to_use:
            imputer = SimpleImputer(strategy='median')
            clf_data[feat] = imputer.fit_transform(
                clf_data[feat].values.reshape(-1, 1))

        # Separate the target from the features
        X = clf_data.iloc[:, 1:]
        y = clf_data.iloc[:, 0]  # the target is assumed to be the first column

        # Split the received data into train and test sets
        x_train, _, y_train, y_test = train_test_split(X,
                                                       y,
                                                       test_size=.3,
                                                       random_state=35)

        from sklearn.preprocessing import StandardScaler

        scaler = StandardScaler()
        x_train = scaler.fit_transform(x_train)

        from sklearn.ensemble import RandomForestClassifier

        # initialize the classifier
        clf = RandomForestClassifier(n_estimators=8, random_state=34)
        clf.fit(x_train, y_train)

        # Put the importances in a DataFrame so they can be plotted
        feats_imp = pd.DataFrame(clf.feature_importances_,
                                 index=X.columns,
                                 columns=['FeatureImportance'])
        feats_imp = feats_imp.sort_values('FeatureImportance', ascending=False)

        feats_imp.plot(kind='barh', figsize=(12, 6), legend=False)
        plt.title(title)
        sns.despine(left=True, bottom=True)
        plt.gca().invert_yaxis()

        plt.savefig(self.DefeaultPath + " feature importance.png", dpi=200)
        plt.cla()
        plt.clf()

        return
Example #3
def plot_tree(profile, group, avg_acc, n_tree, picture):
    '''
    Pick the best random seed and build the model
    :param profile: abundance table
    :param group: grouping (metadata) table
    :param avg_acc: output file with the accuracy for each random seed
    :param n_tree: number of trees in the model
    :param picture: name of the output figure
    :return: None
    '''
    acc = pd.read_csv(avg_acc, sep='\t', header=0, index_col=0)
    best_state = int(acc.sort_values('avgAcc').index[-1])

    # Train and save the prediction model
    rf = RandomForestClassifier(n_estimators=n_tree, max_leaf_nodes=3,
                                random_state=best_state)
    rf.fit(profile, group['label'])
    joblib.dump(rf, 'rf.pkl')

    # Draw the resulting classification tree
    dot = picture.split('.')[0] + '.dot'
    tree_in_forest = rf.estimators_[rf.n_estimators-1]
    export_graphviz(tree_in_forest,
                    out_file=dot,
                    feature_names=profile.columns,
                    filled=True,
                    rounded=False,
                    precision=100)

    os.system('dot -Tpng {0} -o {1}'.format(dot, picture))
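A minimal usage sketch for plot_tree; the file names below are assumptions (the avg_acc table must contain the 'avgAcc' column the function sorts on):

# Hypothetical usage; profile.tsv, group.tsv and avg_acc.tsv are assumed inputs.
profile = pd.read_csv('profile.tsv', sep='\t', header=0, index_col=0)  # samples x features abundance table
group = pd.read_csv('group.tsv', sep='\t', header=0, index_col=0)      # must contain a 'label' column
plot_tree(profile, group, avg_acc='avg_acc.tsv', n_tree=500, picture='best_tree.png')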
Example #4
def estimate_weights_random_forests(X_s, X_t, X_w):

    X_all, all_labels = prepare_data_for_weights_estimation(X_s, X_t)
    # estimate the sample weights with a random forest (not logistic regression)
    kf = KFold(X_all.shape[0], 10, shuffle=True)  # legacy sklearn.cross_validation KFold signature
    param_grid_rf = [{
        "n_estimators": np.array([500]),
        "max_depth": np.array([6]),
        # "max_features": np.array([1, 2, 4, 8, 16]),
        "min_samples_leaf": np.array([100])
    }]
    rf = GridSearchCV(RandomForestClassifier(50,
                                             max_depth=10,
                                             class_weight="auto",
                                             n_jobs=-1),
                      param_grid_rf,
                      cv=kf,
                      n_jobs=-1)
    # NOTE: the grid-search estimator above is overwritten here and never fitted
    rf = RandomForestClassifier(100,
                                max_depth=6,
                                min_samples_leaf=200,
                                class_weight="auto",
                                n_jobs=-1)
    rf.fit(X_all, all_labels)
    # print "best parameters for rf weights determination: ", rf.best_estimator_
    probas = rf.predict_proba(X_w)
    weights = probas[:, 1] / probas[:, 0]
    return weights
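prepare_data_for_weights_estimation is not shown in this example; a plausible sketch, assuming it stacks the source and target samples with 0/1 labels so that predict_proba approximates the density ratio taken above:

import numpy as np

def prepare_data_for_weights_estimation(X_s, X_t):
    # Label source rows 0 and target rows 1; a classifier fitted on this
    # labelling estimates p(target | x) / p(source | x) via its class
    # probabilities, which is the ratio probas[:, 1] / probas[:, 0] above.
    X_all = np.vstack([X_s, X_t])
    all_labels = np.concatenate([np.zeros(X_s.shape[0]), np.ones(X_t.shape[0])])
    return X_all, all_labels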
Example #5
 def mymap(data, N):
     data = cPickle.loads(str(data))
     x = data[:, :-1]
     y = data[:, -1]
     model = RandomForestClassifier(n_estimators=N, max_depth=6)
     model = model.fit(x, y)
     return cPickle.dumps(model)
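A hedged Python 2 driver sketch for mymap, assuming each partition is a pickled NumPy array whose last column holds the labels (all names here are illustrative):

# Illustrative only: serialize a partition, train on it, recover the model.
import cPickle
import numpy as np

X = np.random.rand(100, 10)
y = np.random.randint(0, 2, size=(100, 1))
blob = cPickle.dumps(np.hstack([X, y]))
model = cPickle.loads(mymap(blob, N=50))  # a fitted RandomForestClassifier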
Example #6
    def test_imdb_padded_valid(self):
        num_samples = 32
        num_words = 1024
        (x_train, y_train), (x_test, y_test) = TestUtil.get_imdb(word_dictionary_size=num_words,
                                                                 num_subsamples=num_samples)

        explained_model = RandomForestClassifier(n_estimators=64, max_depth=5, random_state=1)

        counter = CountVectoriser(num_words)
        tfidf_transformer = TfidfTransformer()

        explained_model = Pipeline([('counts', counter),
                                    ('tfidf', tfidf_transformer),
                                    ('model', explained_model)])
        explained_model.fit(x_train, y_train)

        model_builder = RNNModelBuilder(embedding_size=num_words, with_embedding=True,
                                        num_layers=2, num_units=32, activation="relu", p_dropout=0.2, verbose=0,
                                        batch_size=32, learning_rate=0.001, num_epochs=2, early_stopping_patience=128)
        masking_operation = WordDropMasking()
        loss = binary_crossentropy
        explainer = CXPlain(explained_model, model_builder, masking_operation, loss)

        x_train = pad_sequences(x_train, padding="post", truncating="post", dtype=int)
        x_test = pad_sequences(x_test, padding="post", truncating="post", dtype=int, maxlen=x_train.shape[1])

        explainer.fit(x_train, y_train)
        eval_score = explainer.score(x_test, y_test)
        train_score = explainer.get_last_fit_score()
        median = explainer.predict(x_test)
        self.assertTrue(median.shape == x_test.shape)
Example #7
    def __init__(self, initial_forrest_factor=5, n_estimators=10, **kwargs):
        self._initial_forrest_size = n_estimators * initial_forrest_factor
        self._final_forrest_size = n_estimators

        rf_fit_args = copy(kwargs)
        rf_fit_args.update({'n_estimators': self._initial_forrest_size})
        self._rf = RandomForestClassifier(**rf_fit_args)
Example #8
class MyRfClassifier(BaseClassifier):
    def __init__(self, n_estimators, max_depth, min_samples_leaf):
        self.classifier = RandomForestClassifier(
            **{
                'verbose': 1,
                'n_estimators': n_estimators,
                'max_depth': max_depth,
                'min_samples_leaf': min_samples_leaf,
                'n_jobs': 40
            })
        self.name = "rf_n{n}_md{md}_ms{ms}".format(**{
            "n": n_estimators,
            "md": max_depth,
            "ms": min_samples_leaf
        })

    def get_name(self):
        return self.name

    def fit(self, X, y, X_t, y_t):
        return self.classifier.fit(X, y)

    def predict_proba(self, X):
        return self.classifier.predict_proba(X)

    def get_feature_importances(self, feat_names):
        ipts = dict(zip(feat_names, self.classifier.feature_importances_))
        return ipts
Example #9
class LexicaseForestClassifier(BaseEstimator, ClassifierMixin):

    def __init__(self, initial_forrest_factor=5, n_estimators=10, **kwargs):
        self._initial_forrest_size = n_estimators * initial_forrest_factor
        self._final_forrest_size = n_estimators

        rf_fit_args = copy(kwargs)
        rf_fit_args.update({'n_estimators': self._initial_forrest_size})
        self._rf = RandomForestClassifier(**rf_fit_args)

    def fit(self, X, y):
        self._rf.fit(X, y)

        for t in self._rf.estimators_:
            tree_y_pred = t.predict(X)
            t._error_vector = squared_error_vector(y, tree_y_pred)

        final_estimators = []
        for i in range(self._final_forrest_size):
            final_estimators.append(epsilon_lexicase_selection(self._rf.estimators_))

        self._rf.estimators_ = final_estimators
        self._rf.n_estimators = self._final_forrest_size
        # TODO: Set other self._rf parameters to match correct size so that predict works.

    def predict(self, X, y=None):
        return self._rf.predict(X)
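squared_error_vector and epsilon_lexicase_selection are not defined in this snippet; a minimal sketch of what they might look like, following the usual epsilon-lexicase recipe (filter candidates case by case, keeping those within a MAD-based epsilon of the best per-case error):

import random
import numpy as np

def squared_error_vector(y_true, y_pred):
    # One squared error per training case.
    return (np.asarray(y_true) - np.asarray(y_pred)) ** 2

def epsilon_lexicase_selection(estimators):
    candidates = list(estimators)
    cases = list(range(len(candidates[0]._error_vector)))
    random.shuffle(cases)
    for case in cases:
        if len(candidates) == 1:
            break
        errors = np.array([c._error_vector[case] for c in candidates])
        epsilon = np.median(np.abs(errors - np.median(errors)))  # MAD
        candidates = [c for c, e in zip(candidates, errors)
                      if e <= errors.min() + epsilon]
    return random.choice(candidates)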
Example #10
File: Ba.py Project: Emsibil/Bachelor
def enemy_detection_clf():

    chars = np.array(['warrior', 'warlock', 'mage', 'druid', 'rogue', 'shaman', 'paladin', 'priest', 'hunter'])
    data = []
    target = []
    for c in chars:
        p = path('images/character/new/black')
        for f in os.listdir(p+'/'+c):
            img = Image.open(p+'/'+c+'/'+f)
            w, h = img.size
            pixel = img.load()
            tmp = []
            for y in range(h):
                for x in range(w):
                    tmp.append(pixel[x, y] / 255.0)  # float division; np.float was removed from NumPy
            target.append(str(c))  # np.str was likewise removed
            data.append(np.array(tmp))
    data = np.array(data)
    #image = data.view()
    #image.shape = (-1, 22, 30)
    #clf = svm.SVC(gamma = 0.001)
    clf = RandomForestClassifier()
    clf.fit(data, target)
    
    return clf
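The nested pixel loop above can be collapsed into a single vectorized conversion; a sketch, assuming equally sized grayscale images:

import numpy as np
from PIL import Image

def image_to_features(filepath):
    # Flatten a grayscale image into a normalized 1-D feature vector,
    # row by row, matching the per-pixel loop in enemy_detection_clf.
    img = Image.open(filepath).convert('L')
    return np.asarray(img, dtype=np.float64).ravel() / 255.0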
Example #11
def Random_Forest(x_train, Y_train, n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2,
                  min_samples_leaf=1, max_features='auto', bootstrap=True, oob_score=False,
                  random_state=0):
    # forward the keyword arguments (the original ignored them and always used the
    # defaults); the long-removed min_density parameter is dropped
    clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
                                 min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                 max_features=max_features, bootstrap=bootstrap, oob_score=oob_score,
                                 random_state=random_state)
    clf.fit(x_train, Y_train)

    return clf
Example #12
def Random_Forest(x_train, Y_train, n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2,
                  min_samples_leaf=1, max_features='auto', bootstrap=True, oob_score=False, n_jobs=1,
                  random_state=None, verbose=0):
    # forward the keyword arguments (the original ignored them); min_density and
    # compute_importances were removed from scikit-learn and are dropped
    clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
                                 min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                 max_features=max_features, bootstrap=bootstrap, oob_score=oob_score,
                                 n_jobs=n_jobs, random_state=random_state, verbose=verbose)
    clf.fit(x_train, Y_train)

    return clf
Example #13
def training(processed_train_csv_file):
    processed_train_samples = pd.read_csv(processed_train_csv_file)
    processed_train_samples = processed_train_samples.replace([np.inf, -np.inf], np.nan)
    processed_train_samples = processed_train_samples.fillna(value=0)
    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    random.shuffle(processed_train_samples_index_lst)
    shuffled_train_samples = processed_train_samples.loc[processed_train_samples_index_lst]
    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples["booking_bool"].values

    print "Training Random Forest Classifier"
    rf_classifier = RandomForestClassifier(n_estimators=150, verbose=2, learning_rate=0.1, min_samples_split=10)
    rf_classifier.fit(features, labels)
    print "Saving the Random Forest Classifier"
    data_io.save_model(rf_classifier, model_name="rf_classifier.pkl")

    print "Training Gradient Boosting Classifier"
    gb_classifier = GradientBoostingClassifier(n_estimators=150, verbose=2, learning_rate=0.1, min_samples_split=10)
    gb_classifier.fit(features, labels)
    print "Saving the Gradient Boosting Classifier"
    data_io.save_model(gb_classifier, model_name="gb_classifier.pkl")

    print "Training SGD Classifier"
    sgd_classifier = SGDClassifier(loss="modifier_huber", verbose=2, n_jobs=-1)
    sgd_classifier.fit(features, labels)
    print "Saving the SGD Classifier"
    data_io.save_model(sgd_classifier, model_name="sgd_classifier.pkl")
Example #14
    def test_nlp_erroneous_rnn_args_invalid(self):
        num_words = 1024
        (x_train,
         y_train), (x_test,
                    y_test) = TestUtil.get_random_variable_length_dataset(
                        max_value=num_words)

        explained_model = RandomForestClassifier(n_estimators=64,
                                                 max_depth=5,
                                                 random_state=1)

        counter = CountVectoriser(num_words)
        tfidf_transformer = TfidfTransformer()

        explained_model = Pipeline([('counts', counter),
                                    ('tfidf', tfidf_transformer),
                                    ('model', explained_model)])
        explained_model.fit(x_train, y_train)

        with self.assertRaises(ValueError):
            _ = RNNModelBuilder(
                with_embedding=True,
                verbose=0)  # Must also specify the embedding_size argument.

        model_builder = RNNModelBuilder(embedding_size=num_words,
                                        with_embedding=True,
                                        verbose=0)

        input_layer = Input(shape=(10, 2))
        with self.assertRaises(ValueError):
            model_builder.build(input_layer)

        input_layer = Input(shape=(10, 3))
        with self.assertRaises(ValueError):
            model_builder.build(input_layer)
Example #15
File: rf.py Project: makeling/antares
 def __init__(self, path):
     '''
     Constructor
     '''
     self.path = path
     self.model = RandomForestClassifier(n_estimators=150,n_jobs=8)
     self.model_name = 'rf'
Example #16
    def test_RandomForest(self):
        X = [[0, 1], [1, 1]]
        Y = [0, 1]

        regression = RandomForestClassifier(n_estimators=10)
        regression = regression.fit(X, Y)
        regression.predict_proba(X)
Example #17
def importance():
    
    df = pd.read_csv(path.NOTEBOOKS_DATA + 'features3.csv')

    features = list(df.columns.values)
    
    target = 'm_column_result'
    drop_cols = ['m_match_id', 'm_column_result', 'm_match_date', 'm_goals_home',
                 'm_goals_away', 'a_next_match_id', 'h_next_match_id', 'm_favorite',
                 'm_medium', 'm_underdog', 'h_last_match_local', 'a_last_match_local',
                 'rf1000', 'rf1000_fs1']
    for col in drop_cols:
        features.remove(col)

    X = df[features]
    y = df[target]
    # fit a random forest model to the data
    clf = RandomForestClassifier(n_estimators=1000)
    clf.fit(X, y)
    # display the relative importance of each attribute
    for name, importance in zip(features, clf.feature_importances_):
        print(name, importance)
Example #18
def stkFoldCrossValidation():

    X = pickle.load(open('X.p', 'rb'))

    X = np.array(X)

    Y = pickle.load(open('Y.p', 'rb'))

    Y = np.array(Y)

    skf = StratifiedKFold(n_splits=10)
    skf.get_n_splits(X, Y)

    k = 1
    for train_index, test_index in skf.split(X, Y):

        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

        print(k)
        k += 1

        rf = RandomForestClassifier()

        rf.fit(X_train, Y_train)

        yp = rf.predict(X_test)
        print(classification_report(Y_test, yp, digits=6))
Example #19
File: models.py Project: kalpanki/pp
def rforest_classify(X, Y):
    #clf = RandomForestClassifier(criterion='gini',max_features=7,n_estimators=100,n_jobs=3,min_samples_leaf=5)

    # min_density was removed from scikit-learn, so it is dropped here
    clf = RandomForestClassifier(n_estimators=500, criterion='entropy', max_depth=None,
                                 min_samples_split=2, min_samples_leaf=1, max_features='auto',
                                 bootstrap=False, oob_score=False, n_jobs=-1)
    clf.fit(X, Y)
    return clf
Example #20
def RF_Features_Importance(X, Y, outputfile="RF.csv"):
    forest = RandomForestClassifier(n_estimators=300)
    forest.fit(X, Y)
    importances = np.matrix(forest.feature_importances_).tolist()[0]
    df = pd.DataFrame(list(zip(header, importances)),
                      columns=["Features", "Importance"])

    df.to_csv(outputfile, index=False)
Example #21
def forest(X, y, model_path):
    model = RandomForestClassifier()
    model.fit(X, y)
    expected = y
    predicted = model.predict(X)
    print(metrics.classification_report(expected, predicted))
    print(metrics.confusion_matrix(expected, predicted))
    joblib.dump(model, model_path)
 def __init__(self, sig_weight=1., pow_sig=1., pow_bg=1., n_estimators=10,
              criterion="gini", max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features="auto",
              bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, min_density=None,
              compute_importances=None):
     RandomForestClassifier.__init__(self)
     # Everything should be set via set_params
     self.sig_weight = sig_weight
     self.pow_bg = pow_bg
     self.pow_sig = pow_sig
Example #23
def calcRandomForestClassifier(channels_training, channels_testing,
                               target_training, target_testing):
    clf = RandomForestClassifier(n_estimators=500,
                                 max_features=int(
                                     sqrt(len(channels_training[0]))))
    clf = clf.fit(channels_training, target_training)
    predictions = clf.predict(channels_testing)
    comp = [predictions, target_testing, channels_testing]
    return clf, comp
Example #24
def calc_score(test, train):
    test_f, test_l = split_data_label(test)
    train_f, train_l = split_data_label(train)
    # train the model and compute the accuracy

    clf = RandomForestClassifier()
    clf.fit(train_f, train_l)
    pre = clf.predict(test_f)
    return metrics.accuracy_score(test_l, pre)
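A hedged k-fold driver around calc_score, assuming data is a list of labeled rows that split_data_label can consume (the names here are illustrative):

import numpy as np

def cross_validate(data, k=5):
    # Shuffle, split round-robin into k folds, and score each fold
    # against a model trained on the remaining folds.
    data = list(data)
    np.random.shuffle(data)
    folds = [data[i::k] for i in range(k)]
    scores = []
    for i in range(k):
        test = folds[i]
        train = [row for j, fold in enumerate(folds) if j != i for row in fold]
        scores.append(calc_score(test, train))
    return sum(scores) / float(k)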
Example #25
def train_rf(train_vec, train_label):
    from sklearn.ensemble import RandomForestClassifier as RFC
    # rfrclf = RFR(n_estimators=1001)
    # rfrclf.fit(train_vec, train_label)
    # print rfrclf.feature_importances_
    trfclf = RFC(n_estimators=1001)
    trfclf.fit(train_vec, train_label)
    # print rfclf.feature_importances_
    return trfclf
Example #26
def drawfeature(train_data_path,train_file_name,test_data_path,test_file_name):
    train_file = os.path.join(train_data_path,train_file_name)
    train_data = pd.read_csv(train_file)
    n_data_train = train_data['text'].size
    print 'n_data_train is %s' %n_data_train
    print type(n_data_train)
    
    test_file = os.path.join(test_data_path,test_file_name)
    test_data = pd.read_csv(test_file)
    n_data_test = test_data['text'].size
    print 'n_data_test is %s' %n_data_test
    print type(n_data_test)
    
    vectorizer = CountVectorizer(analyzer='word',tokenizer = None,
        preprocessor = None, stop_words=None, max_features = 5000)
    transformer = TfidfTransformer()
    
    train_data_words = []
    
    print 'start with words in train data set'
    for i in xrange(n_data_train):
        if((i+1)%1000 == 0):
            print 'Drawfeatures line %d of %d' %(i+1,n_data_train)
        train_data_words.append(words_to_features(train_data['text'][i]))
    print 'start bag of words in train data....'
    train_data_features = vectorizer.fit_transform(train_data_words)
    train_data_features = train_data_features.toarray()
    print 'start tfidf in train data....'
    train_data_features = transformer.fit_transform(train_data_features)
    train_data_features = train_data_features.toarray()
    #test-data processing
    test_data_words = []
    for i in xrange(n_data_test):
        if((i+1)%1000 == 0):
            print 'Drawfeatures line %d of %d' %(i+1,n_data_test)
        test_data_words.append(words_to_features(test_data['text'][i]))
    
    # use transform (not fit_transform) so the test data shares the train vocabulary,
    # and apply the same tf-idf transform that the training features received
    test_data_features = vectorizer.transform(test_data_words)
    test_data_features = test_data_features.toarray()
    test_data_features = transformer.transform(test_data_features)
    test_data_features = test_data_features.toarray()

    print 'random forest go...'
    forest = RandomForestClassifier(n_estimators = 13)
    forest = forest.fit(train_data_features,train_data['label'])
    pred = forest.predict(test_data_features)
    pred = pd.Series(pred,name='Target')
    pred.to_csv('SENTI_RF.CSV',index=None, header = None)

    
    print 'naive bayes go...'
    mnb = MultinomialNB(alpha=0.01)
    mnb = mnb.fit(train_data_features,train_data['label'])
    pred = mnb.predict(test_data_features)
    pred = pd.Series(pred,name = 'Target')
    pred.to_csv('SENTI_MNB',index = None, header = True)
Example #27
def main(args):

    if args.analyse is not None:
        train_data_x, test_data_x,train_data_y, test_data_y  = process_data(args.analyse)

        RT = RandomForestClassifier(n_estimators=100)
        RT.fit(train_data_x, train_data_y)
        print RT.score(test_data_x, test_data_y)

    return
Example #28
File: RF.py Project: eenx15/Volvo-DataX
def random_forest_classifier(features, target):
    """
    To train the random forest classifier with features and target data
    :param features:
    :param target:
    :return: trained random forest classifier
    """
    clf = RandomForestClassifier(n_estimators=600, max_depth=50)
    clf.fit(features, target)
    return clf
Example #29
File: Ba.py Project: Emsibil/Bachelor
def my_digits():
    digits = _data()
    
    n_samples = len(digits.images)
    datas = digits.images.reshape((n_samples, -1))

    classifier = RandomForestClassifier()
    classifier.fit(datas, digits.target)
    
    return classifier
Example #30
def test_unigram_all():
    #    test_unigram(NuSVC(), 'SVM', '../data/pos_origin.txt', '../data/neg_origin.txt', 'origin')
    #    test_unigram(NuSVC(), 'SVM', '../data/pos_stem.txt', '../data/neg_stem.txt', 'stem')
    test_unigram(BernoulliNB(), 'BNB', '../data/pos_origin.txt',
                 '../data/neg_origin.txt', 'origin')
    test_unigram(BernoulliNB(), 'BNB', '../data/pos_stem.txt',
                 '../data/neg_stem.txt', 'stem')
    test_unigram(RandomForestClassifier(), 'RFC', '../data/pos_origin.txt',
                 '../data/neg_origin.txt', 'origin')
    test_unigram(RandomForestClassifier(), 'RFC', '../data/pos_stem.txt',
                 '../data/neg_stem.txt', 'stem')
Example #31
 def RandomForestClassifer(self):

     '''
     Run the random forest classifier.
     '''
     train_Array = self.titanic_train_frame.values
     self.test_Array = self.titanic_test_frame.values
     randomForest = RandomForestClassifier(n_estimators = 100, n_jobs = -1)
     randomForest.fit(train_Array[0::,1::],train_Array[0::,0])
     self.predicted_probability = randomForest.predict(self.test_Array[0::,0::])
     self.predicted_probability_list = self.predicted_probability.tolist()
Example #32
class RFClassifier(super.abstract_classifier):

    def __init__(self, train_features, train_labels, num_of_trees):
        self.train_features = train_features
        self.train_labels = train_labels
        self.rf_member = RandomForestClassifier(num_of_trees)

    def train(self):
        self.rf_member.fit(self.train_features, self.train_labels)

    def classify(self, newVector):
        return self.rf_member.predict(newVector)
Example #33
def evalOne(enabledColumns):
    features = [all_features[i] for i in range(0, len(all_features)) if enabledColumns[i]]
    Y = []
    P = []
    for group in range(0,5):
    #     print("Test group " + str(group + 1))
        trainStationList = []
        testStationList = []
        for i in range(0,5):
            if i == group:
                testStationList.extend(groups[i])
            else:
                trainStationList.extend(groups[i])
        trainStations = set(float(station) for station in trainStationList)
        # reorder train stations
    #     print("\ttrainStationList:" + str(trainStationList))
        trainStationList = [s for s in all_stations if float(s) in trainStations]
    #     print("\ttrainStationList:" + str(trainStationList))
        testStations = set(float(station) for station in testStationList)
    #     print("\ttestStationList:" + str(testStationList))
        trainX, testX, trainY, testY, trainLocation, testLocation = splitDataForXValidationWithLocation(trainStations, testStations, "location", data, features, "target")
     
        train_lower = [float(trainStationList[i]) for i in range(0, len(trainStationList)) if i < (len(trainStationList) / 2.0)]
#         train_upper = [float(trainStationList[i]) for i in range(0, len(trainStationList)) if i >= (len(trainStationList) / 2.0)]
         
        test_lower = [float(testStationList[i]) for i in range(0, len(testStationList)) if i < (len(testStationList) / 2.0)]
#         test_upper = [float(testStationList[i]) for i in range(0, len(testStationList)) if i >= (len(testStationList) / 2.0)]
         
        trainY = []
        for l in trainLocation:
            if l in train_lower:
                trainY.append(0)
            else:
                trainY.append(1)
         
        testY = []
        for l in testLocation:
            if l in test_lower:
                testY.append(0)
            else:
                testY.append(1)
         
        model = RandomForestClassifier(random_state=42, n_estimators=20, max_depth=9, n_jobs=-1)
        model.fit(trainX, trainY)
        predY = model.predict(testX)
         
        Y.extend(testY)
        P.extend(predY)
     
    f1 = f1_score(Y, P)
    accuracy = accuracy_score(Y, P)
    return f1, accuracy
Example #34
def do_training(processed_train_csv_file):
    ## Processed train samples reading
    # read saved processed train samples from the given csv file
    processed_train_samples = pd.read_csv(processed_train_csv_file)

    # inf to nan
    processed_train_samples = processed_train_samples.replace([np.inf, -np.inf], np.nan)
    # nan to 0
    processed_train_samples = processed_train_samples.fillna(value=0)

    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    # the samples were sorted earlier; shuffling them here gives better results
    random.shuffle(processed_train_samples_index_lst)

    # organize new train samples and targets
    shuffled_train_samples = processed_train_samples.loc[processed_train_samples_index_lst]
    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples['booking_bool'].values

    # Model training
    # 1 Random Forest Classifier

    print("Training Random Forest Classifier")
    rf_classifier = RandomForestClassifier(n_estimators=150,
                                           verbose=2,
                                           n_jobs=-1,
                                           min_samples_split=10)
    rf_classifier.fit(features, labels)

    print("Saving the Random Forest Classifier")
    data_io.save_model(rf_classifier, model_name='rf_classifier.pkl')

    # 2 Gradient Boosting Classifier
    print("Gradient Boosting  Classifier")
    gb_classifier = GradientBoostingClassifier(n_estimators=150,
                                               verbose=2,
                                               learning_rate=0.1,
                                               min_samples_split=10)
    gb_classifier.fit(features, labels)
    print("Saving the Gradient Boosting  Classifier")
    data_io.save_model(gb_classifier, model_name='gb_classifier.pkl')

    # 3 SGD Classifier
    print("SGD Classifier")
    sgd_classifier = SGDClassifier(loss="modified_huber", verbose=2,
                                   n_jobs=-1)
    sgd_classifier.fit(features, labels)

    print("saved the SGD Classifier")
    data_io.save_model(sgd_classifier, model_name='sgd_classifier.pkl')
Example #35
def train_classifier(vocal_frames, non_vocal_frames):

    frames = np.append(vocal_frames, non_vocal_frames, axis=0)

    labels_vocal = np.ones(vocal_frames.shape[0])
    labels_non_vocal = np.zeros(non_vocal_frames.shape[0])

    labels = np.append(labels_vocal, labels_non_vocal, axis=0)

    rfc = RandomForestClassifier(n_estimators=100, max_depth=None)
    rfc.fit(frames, labels)

    return rfc
Example #36
def RF(pth):
     train_desc=np.load(pth+'/training_features.npy')
     nbr_occurences = np.sum( (train_desc > 0) * 1, axis = 0)
     idf = np.array(np.log((1.0*len(image_paths)+1) / (1.0*nbr_occurences + 1)), 'float32')

     # Scaling the words
     stdSlr = StandardScaler().fit(train_desc)
     train_desc = stdSlr.transform(train_desc)
     modelRF=RandomForestClassifier(n_estimators=10,
                                    max_depth=5,max_features=1,random_state=0)
     modelRF.fit(train_desc,np.array(train_labels))
     joblib.dump((modelRF, img_classes, stdSlr), pth+"/rf-bof.pkl", compress=3) 
     test(pth, "rf-")
Example #37
    def train(self, log_probs, labels):
        '''
        Train the stacked classifier
        @param log_probs: list of probability matrices (channels, label)
        @param labels: label for each feature-vector
        '''
        print 'Train stacked classifier with %d windows' % labels.shape[0]
        log_probs = [self.channel_sort.sort(f) for f in log_probs]
        log_probs = np.vstack(log_probs)

        # TODO: classifier as Parameter
        self.stacked_classifier = RandomForestClassifier(n_estimators=10)
        self.stacked_classifier.fit(log_probs, labels)
Example #38
def train_model(X_train,y_train):
    print("training the model ...")
    
    # create sets for probability calibration
    X_train_train, X_prob_cal, y_train_train, y_prob_cal = train_test_split(X_train,
                                                            y_train,
                                                            test_size=0.2)

    
    rf = RandomForestClassifier(
               max_features="auto",
               n_estimators=2000,
               max_depth=8,
               n_jobs=-1,
               class_weight = 'balanced',
               verbose=1)
    rf.fit(X_train_train,y_train_train)
    
    # feature importances
   
#    feature_importance = False
#    if(feature_importance):
#        
#        importances = rf.feature_importances_
#        std = np.std([tree.feature_importances_ for tree in rf.estimators_],
#                 axis=0)
#        indices = np.argsort(importances)[::-1]
#        col_names = df.drop('bin',axis=1).columns.values
#        print("Feature ranking:")
#        
#        for f in range(X_train_train.shape[1]):
#            print("%d. %s (%f)" % (f + 1, col_names[indices[f]], importances[indices[f]]))
#        
#        # Plot the feature importances of the forest
#        plt.figure()
#        plt.title("Feature importances")
#        plt.bar(range(X_train_train.shape[1]), importances[indices],
#               color="r", yerr=std[indices], align="center")
#        plt.xticks(range(X_train_train.shape[1]), col_names[indices],rotation = 50)
#        plt.xlim([-1, X_train_train.shape[1]])
#        plt.show()
        
    
    # Probability calibration
    sig_clf = CalibratedClassifierCV(rf, method="sigmoid", cv="prefit")
    sig_clf.fit(X_prob_cal, y_prob_cal)
    y_pred_train = sig_clf.predict_proba(X_train)
    
    
    print(".. training log_loss  : {:0.2f} %".format(log_loss(y_train,y_pred_train)*100))
    return sig_clf
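A usage sketch for train_model; X_test and y_test are assumed hold-out arrays that were seen neither in training nor in calibration:

# Hypothetical evaluation of the calibrated classifier on a hold-out set.
sig_clf = train_model(X_train, y_train)
y_prob = sig_clf.predict_proba(X_test)
print(".. test log_loss      : {:0.2f} %".format(log_loss(y_test, y_prob) * 100))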
Example #39
class Model(BaseModel):
    """Antares implementation of scikit learn random forest classifier

    """
    def __init__(self,
                 categorical_features=None,
                 n_estimators=50,
                 n_jobs=-1,
                 max_depth=10):
        '''
        Example:
            >>> from madmex.modeling.supervised.rf import Model
            >>> rf = Model()
            >>> # Write model to db
            >>> rf.to_db(name='test_model', recipe='mexmad', training_set='no')
            >>> # Read model from db
            >>> rf2 = Model.from_db('test_model')
        '''
        super().__init__(categorical_features=categorical_features)
        self.model = RandomForestClassifier(n_estimators=n_estimators,
                                            n_jobs=n_jobs,
                                            max_depth=max_depth)
        self.model_name = 'rf'

    def fit(self, X, y):
        X = self.hot_encode_training(X)
        self.model.fit(X, y)

    def predict(self, X):
        '''
        Simply passes down the prediction from the underlying model.
        '''
        X = self.hot_encode_predict(X)
        return self.model.predict(X)

    def predict_confidence(self, X):
        """Get confidence of every prediction
        """
        X = self.hot_encode_predict(X)
        return self.model.predict_proba(X).max(axis=1)

    def score(self, X, y):
        '''
        Test the model given a dataset and a target vector.

        This method applies the model that this object represents to the given dataset using
        the response variable y. It is a measure of the accuracy of the trained model. Usually
        the original dataset should be split into training and testing subsets to cross validate
        the model.
        '''
        return self.model.score(X, y)
Example #40
def classic_model(image_dir, image_lists, method):

    X, y = get_X_y(image_dir, image_lists, ['training', 'validation'], method)
    classifier = RandomForestClassifier(n_estimators=1000, n_jobs=4)
    classifier.fit(X, y)

    X_test, y_test = get_X_y(image_dir, image_lists, ['testing'], method)
    predictions = classifier.predict(X_test)
    confusion = pandas.crosstab(y_test,
                                predictions,
                                rownames=['Actual Class'],
                                colnames=['Predicted Class'])
    print confusion
    return accuracy_score(y_test, predictions)
Example #41
 def __init__(self, n_estimators, max_depth, min_samples_leaf):
     self.classifier = RandomForestClassifier(
         **{
             'verbose': 1,
             'n_estimators': n_estimators,
             'max_depth': max_depth,
             'min_samples_leaf': min_samples_leaf,
             'n_jobs': 40
         })
     self.name = "rf_n{n}_md{md}_ms{ms}".format(**{
         "n": n_estimators,
         "md": max_depth,
         "ms": min_samples_leaf
     })
Example #42
def try_model(train):
    print(train.shape)
    features = ["phone_brand", "device_model",  "event_count", "action_radius_max", "medianTime", "minTime", "maxTime", "weekday", "appcounts1"]
    encoder = LabelEncoder()
    train["group"] = encoder.fit_transform(train["group"].values)
    
    rf = RandomForestClassifier(n_estimators=50, max_depth=15, max_features=6, bootstrap=True, n_jobs=4, random_state=2016, class_weight=None)
    
    rf.fit(train[features].values, train["group"].values)
    feature_importance(rf, features)
    
    skf = StratifiedKFold(train["group"].values, n_folds=5, shuffle=True, random_state=2016)
    scores = cross_val_score(rf, train[features].values, train["group"].values, scoring="log_loss", cv=skf, n_jobs=1)
    print(scores)
    print("RF Score: %0.5f" %(-scores.mean())) # RF Score: 2.39884
Example #43
def just_pred(x, y):
    xlen = len(x)
    # shuffle and actually apply the permutation (the original built a shuffled
    # index list but never used it, so the split was not random)
    idx = np.random.permutation(xlen)
    x = x.iloc[idx]
    y = y.iloc[idx]
    trainpct = 0.7
    trainlen = int(trainpct * xlen)
    xtrain = x.iloc[:trainlen, :]
    ytrain = y.iloc[:trainlen]
    xtest = x.iloc[trainlen:, :]
    ytest = y.iloc[trainlen:]
    rf = RandomForestClassifier()
    rf.fit(xtrain, ytrain)
    ypred = rf.predict(xtest)
    return ytest, ypred
Example #44
def crossval(x, y, k=5):
    xlen = len(x)
    trainpct = 0.7
    trainlen = int(trainpct * xlen)
    for i in range(k):
        # reshuffle each round (the original referenced an undefined X and
        # never applied the shuffled index)
        idx = np.random.permutation(xlen)
        x = x.iloc[idx]
        y = y.iloc[idx]
        xtrain = x.iloc[:trainlen, :]
        ytrain = y.iloc[:trainlen]
        xtest = x.iloc[trainlen:, :]
        ytest = y.iloc[trainlen:]
        rf = RandomForestClassifier()
        rf.fit(xtrain, ytrain)
        ypred = rf.predict(xtest)
        print(ypred)
Example #45
 def __init__(self, n_estimators, max_depth, min_samples_leaf):
     self.classifier = RandomForestClassifier(**{'verbose':1, 'n_estimators': n_estimators,
                                                 'max_depth':max_depth,'min_samples_leaf':min_samples_leaf,
                                                 'n_jobs':40})
     self.name = "rf_n{n}_md{md}_ms{ms}".format(
         **{"n": n_estimators, "md": max_depth, "ms": min_samples_leaf}
     )
Example #46
 def __init__(self, verbose=1, n_estimators = 2000, max_depth=8, min_samples_leaf=10000,
              n_jobs=25):
     self.classifier = RandomForestClassifier( **{'verbose': verbose,
                                                  'n_estimators': n_estimators,
                                                  'max_depth': max_depth, 'min_samples_leaf': min_samples_leaf,
                                                   'n_jobs': n_jobs})
     self.name = "rf_n{n}_md{md}_ms{ms}".format(
         **{"n": n_estimators, "md": max_depth, "ms": min_samples_leaf}
     )
Example #47
class MyRfClassifier(BaseClassifier):
    def __init__(self, n_estimators, max_depth, min_samples_leaf):
        self.classifier = RandomForestClassifier(**{'verbose':1, 'n_estimators': n_estimators,
                                                    'max_depth':max_depth,'min_samples_leaf':min_samples_leaf,
                                                    'n_jobs':40})
        self.name = "rf_n{n}_md{md}_ms{ms}".format(
            **{"n": n_estimators, "md": max_depth, "ms": min_samples_leaf}
        )
    def get_name(self):
        return self.name

    def fit(self, X, y, X_t, y_t):
        return self.classifier.fit(X, y)

    def predict_proba(self, X):
        return self.classifier.predict_proba(X)

    def get_feature_importances(self, feat_names):
        ipts = dict(zip(feat_names, self.classifier.feature_importances_))
        return ipts
Example #48
class MyRandomForestClassifier(BaseClassifier):
    def __init__(self, verbose=1, n_estimators = 2000, max_depth=8, min_samples_leaf=10000,
                 n_jobs=25):
        self.classifier = RandomForestClassifier( **{'verbose': verbose,
                                                     'n_estimators': n_estimators,
                                                     'max_depth': max_depth, 'min_samples_leaf': min_samples_leaf,
                                                      'n_jobs': n_jobs})
        self.name = "rf_n{n}_md{md}_ms{ms}".format(
            **{"n": n_estimators, "md": max_depth, "ms": min_samples_leaf}
        )

    def get_name(self):
        return self.name

    def fit(self, X, y):
        return self.classifier.fit(X, y)

    def predict_proba(self, X):
        return self.classifier.predict_proba(X)

    def get_feature_importances(self, feat_names):
        return self.classifier.feature_importances_
Example #49
 def initDecTrees(self, path):
     for filename in os.listdir(path):
         if filename=='train.csv':
             with open(os.path.join(path,filename)) as infile:
                 f = csv.reader(infile)
                 aux = f.next()  # skip the header
                 x = []
                 y = []
                 for line in f:
                     if size(line) > 1:
                         if self.option == 1:
                             data = [converter(line[2]), converter(line[3]), converter(line[4]), converter(line[7]), converter(line[9])]
                             y.append(converter(line[6]))
                             x.append(data)
                         elif self.option == 2:
                             auxDeputy = fetchDeputyParty(line[2])
                             data = [converter(line[2]), converter(line[3]), converter(line[4]), converter(line[7]), converter(line[9]), encodeParty(auxDeputy['party']), encodeState(auxDeputy['state'])]
                             y.append(converter(line[6]))
                             x.append(data)
             clf = RandomForestClassifier(n_estimators=5)
             clf.fit(x, y)
             return clf
Example #50
def drawfeature(train_data_path='./train', train_filename='train_cleaned',test_data_path='./test', test_filename='test_cleaned'):
    train_file = os.path.join(train_data_path, train_filename)
    train_data = pd.read_csv(train_file)
    n_train_data = train_data['text'].size

    test_file = os.path.join(test_data_path,test_filename)
    test_data = pd.read_csv(test_file)
    n_test_data = test_data['text'].size

    vectorizer = CountVectorizer(analyzer="word",tokenizer=None, preprocessor=None, stop_words=None, max_features=2000)
    transformer = TfidfTransformer()

    train_data_words = []
    for i in xrange(n_train_data):
        train_data_words.append(words_to_features(train_data['text'][i]))
    train_data_features = vectorizer.fit_transform(train_data_words)
    train_data_features = train_data_features.toarray()
    train_data_features = transformer.fit_transform(train_data_features)
    train_data_features = train_data_features.toarray()
    # a 2-D feature matrix needs a DataFrame (pd.Series would raise here)
    train_data_pd = pd.DataFrame(train_data_features)
    train_data_pd.to_csv("trainfeature.csv", index=None, header=True)


    test_data_words = []
    for i in xrange(n_test_data):
        test_data_words.append(words_to_features(test_data['text'][i]))
    # transform (not fit_transform): reuse the vocabulary and idf learned on the train set
    test_data_features = vectorizer.transform(test_data_words)
    test_data_features = test_data_features.toarray()
    test_data_features = transformer.transform(test_data_features)
    test_data_features = test_data_features.toarray()
    test_data_pd = pd.DataFrame(test_data_features)
    test_data_pd.to_csv("testfeature.csv", index=None, header=True)

    forest = RandomForestClassifier(n_estimators=60)
    forest = forest.fit(train_data_features, train_data['label'])
    pred = forest.predict(test_data_features)
    pred = pd.Series(pred,name='Target')
    pred.to_csv("bow_tfidf_RF.csv", index=None, header=True)
Example #51
audit_y = audit_y.astype(int)

print(audit_X.dtype, audit_y.dtype)

def predict_audit(classifier):
    adjusted = DataFrame(classifier.predict(audit_X), columns = ["Adjusted"])
    adjusted_proba = DataFrame(classifier.predict_proba(audit_X), columns = ["probability_0", "probability_1"])
    return pandas.concat((adjusted, adjusted_proba), axis = 1)

audit_tree = DecisionTreeClassifier(random_state = 13, min_samples_leaf = 5)
audit_tree.fit(audit_X, audit_y)

store_pkl(audit_tree, "DecisionTreeAudit.pkl")
store_csv(predict_audit(audit_tree), "DecisionTreeAudit.csv")

audit_forest = RandomForestClassifier(random_state = 13, min_samples_leaf = 5)
audit_forest.fit(audit_X, audit_y)

store_pkl(audit_forest, "RandomForestAudit.pkl")
store_csv(predict_audit(audit_forest), "RandomForestAudit.csv")

audit_regression = LogisticRegression()
audit_regression.fit(audit_X, audit_y)

store_pkl(audit_regression, "RegressionAudit.pkl")
store_csv(predict_audit(audit_regression), "RegressionAudit.csv")

#
# Multi-class classification
#
Example #52
                                                    test_size=0.25,
                                                    random_state=666)

train_ind = X_train.index.values
test_ind = X_test.index.values

g_train = g.iloc[train_ind,:]
g_test = g.iloc[test_ind,:]


clf = tree.DecisionTreeClassifier(criterion='gini', 
                                  max_depth=6, 
                                  min_samples_leaf=3)
####################
clf = RandomForestClassifier(criterion='gini', 
                                  max_depth = 6, 
                                  min_samples_leaf=3,
                                  n_estimators=50)
####################
clf = AdaBoostClassifier(DecisionTreeClassifier(criterion='gini',
                                                max_depth=6, 
                                                min_samples_leaf=3),
                         n_estimators = 200,
                         learning_rate = 0.1)
####################
clf = neighbors.KNeighborsClassifier(100, weights='uniform')
clf = neighbors.KNeighborsClassifier(100, weights='distance')
####################
clf = GaussianNB()
##############################
t0 = time()
param_grid = {'C': [150, 500, 750, 1000],
Example #53
### the training data (features_train, labels_train) have both "fast" and "slow"
### points mixed together--separate them so we can give them different colors
### in the scatterplot and identify them visually
grade_fast = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii]==0]
bumpy_fast = [features_train[ii][1] for ii in range(0, len(features_train)) if labels_train[ii]==0]
grade_slow = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii]==1]
bumpy_slow = [features_train[ii][1] for ii in range(0, len(features_train)) if labels_train[ii]==1]


#### initial visualization
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.scatter(bumpy_fast, grade_fast, color = "b", label="fast")
plt.scatter(grade_slow, bumpy_slow, color = "r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")

### your code here!  name your classifier object clf if you want the 
clf=RandomForestClassifier()
clf.fit(features_train,labels_train)
### draw the decision boundary with the text points overlaid
prettyPicture(clf, features_test, labels_test)

plt.show()
################################################################################




Example #54
def runns(resp_var, size_of_test_data,dataset,positive_class,n_estimators,important_features,dealing_with_nulls):
	dataset = pd.read_csv('raw_data.csv', low_memory=False) # For testing purposes
	#----DATA PREPROCESSING
	#-------dealing with NULL values in the data
	#----------remove the rows in which the response is null
	dataset=dataset.dropna(subset=[resp_var])
	#----------dealing with nulls
	dataset=deal_with_nulls(dealing_with_nulls,dataset)
	#----FEATURE SELECTION
	#-------get predictors important in predicting the response
	#-----------transform categorical predictors to dummy variables
	predictors=dataset.drop(resp_var,axis=1,inplace=False)
	predictors=pd.get_dummies(predictors)
	#-----------balance the classes in the response var
	ros = RandomOverSampler(random_state=0)
	resp=dataset[resp_var]
	prds, resp = ros.fit_sample(predictors, resp)
	#-----------fit the random forest classifier to give us the important predictors
	rf_clf = RandomForestClassifier(n_estimators=n_estimators)
	rf_clf.fit(prds,resp)
	#-------get the important predictors
	feature_imp = pd.Series(rf_clf.feature_importances_,
                    index=list(predictors.iloc[:,0:])).sort_values(ascending=False)
	#-------names of the important predictors
	important_predictor_names = feature_imp.index[0:important_features]
	#-------subset the data to get only the important predictors and the response
	resp=pd.DataFrame(data=resp,columns=[resp_var])
	predictors=pd.DataFrame(prds,columns=list(predictors))
	dataset=pd.concat([resp,predictors],axis=1)
	#---------------------------------------------------------
	#----MODEL TRAINING
	#--------Remove the response variables from the features variables - axis 1 refers to the columns
	m_data= dataset.drop(resp_var, axis = 1,inplace=False) 
	# Response variables are the values we want to predict
	resp_var = np.array(dataset[resp_var])

	dataset = pd.get_dummies(m_data)
    
	# Saving feature names for later use
	feature_list = list(m_data.columns)
	# Convert to numpy array
	dataset = np.array(dataset)

	# Split the data into training and testing sets
	train_features, test_features, train_labels, test_labels = train_test_split(dataset, resp_var, test_size = size_of_test_data, random_state = 402)

	# Instantiate the classifier (an RBF-kernel SVC, despite the n_estimators argument above)
	clf = SVC(kernel='rbf',probability=True)

	# Train the model on training data
	clf.fit(train_features, train_labels)
    # evaluation
	predicted = clf.predict(test_features)
	pred_prob = clf.predict_proba(test_features)
    
	accuracy = accuracy_score(test_labels, predicted)
	#confusion matrix
	cnf = (confusion_matrix(test_labels,predicted))
	#precision score
	precision = precision_score(test_labels,predicted,pos_label=positive_class)
	#avg pres
	avg_precision = average_precision_score(test_labels,pred_prob[:,[1]])
	#recall score
	rec = recall_score(test_labels,predicted,pos_label=positive_class)
	#f1 score
	fscore = f1_score(test_labels,predicted,pos_label=positive_class)
	#fbeta score
	fbeta = fbeta_score(test_labels,predicted,beta=0.5)
	#hamming_loss
	hamming = hamming_loss(test_labels,predicted)
	#jaccard similarity score
	jaccard = jaccard_similarity_score(test_labels,predicted)
	#logloss
	logloss = log_loss(test_labels,predicted)
	#zero-oneloss
	zero_one = zero_one_loss(test_labels,predicted)
	#auc roc 
	area_under_roc = roc_auc_score(test_labels,pred_prob[:,[1]])
	#cohen_score
	cohen = cohen_kappa_score(test_labels,predicted)
	#mathews corr
	mathews = matthews_corrcoef(test_labels,predicted)
	# Variable importances from the important features selection stage
	variable_importance_list = list(zip(prds, feature_imp))
	output={"accuracy":accuracy,"precision":precision,"average precision":avg_precision,"recall":rec,"fscore":fscore,"fbeta":fbeta,"hamming":hamming,"jaccard":jaccard,"logloss":logloss,"zero_one":zero_one,"area_under_roc":area_under_roc,"cohen":cohen,"mathews":mathews}
	output=json.dumps(output)
	return jsonify({"Predictions": output})
Example #55
                #'Date',
                'WnvPresent_DateTrapSpecies'], axis=1)
    

    # Create dfs based on mask    
    X_train = X[~msk]
    X_test = X[msk]
    y_train = y[~msk]
    y_test = y[msk]

    return X_train, X_test, y_train, y_test


# Create classifiers

clf = RandomForestClassifier(n_estimators=500,
                            min_samples_leaf=5)

clf = xgbwrapper.XgbWrapper({'objective': 'binary:logistic',
                             'eval_metric': 'auc',
                             'eta': 0.05,
                             'silent': 1})


# Cross validation
if do_cross_val:

    # Leave-one-year-out cross-validation
    scores = []
    total_pred = np.array([])
    total_test = np.array([])
    
Example #56
File: model2.py Project: tearf001/ucloud
def model_pred(trainX,trainY,testX,model_type):
    if model_type == "rf":
        clf = RandomForestClassifier(n_estimators = 500,n_jobs = 20)
        clf.fit(trainX,trainY)
        pred = clf.predict(testX)
    if model_type == "gbdt":
        clf = GradientBoostingClassifier(n_estimators=6,learning_rate=0.9,random_state=0)
        clf.fit(trainX,trainY)
        pred = clf.predict(testX)
    if model_type == "fusion":
        prob = np.zeros(len(testX))
        params = [100,200,300,400,500]
        for param in params:
            clf = RandomForestClassifier(n_estimators = param,n_jobs = 20,bootstrap=True)
            clf.fit(trainX,trainY)
            prob += clf.predict(testX)
        '''
        params = [1,2,3,4,5,6,7,8,9,10]
        for param in params:
            clf = GradientBoostingClassifier(n_estimators=param,learning_rate=0.9,random_state=0)
            clf.fit(trainX,trainY)
            prob += clf.predict(testX)
        '''
        pred = list(prob >= 3)
    print "the pos rate is:",float(sum(pred))/len(pred)
    return pred
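The "fusion" branch above is a hard majority vote over five forests (at least 3 of 5 must predict the positive class); an alternative soft-vote sketch that averages class probabilities instead (an assumption, not part of the original project):

def soft_vote_pred(trainX, trainY, testX, params=(100, 200, 300, 400, 500)):
    # Average P(class = 1) across the forests and threshold at 0.5.
    prob = np.zeros(len(testX))
    for n in params:
        clf = RandomForestClassifier(n_estimators=n, n_jobs=20)
        clf.fit(trainX, trainY)
        prob += clf.predict_proba(testX)[:, 1]
    return list(prob / len(params) >= 0.5)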
Example #57
                    if 'FIGURES' in line:
                        break
                    vals = line.strip().split('\t')
                    text = vals[2]

                    corpus_test.append(text)
                    if int(vals[0]) == 0:
                        y_test.append('0')
                    else:
                        y_test.append('1')
    
    X_train = vectorizer.fit_transform(corpus_train)

    X_test = vectorizer.transform(corpus_test)
    
    clf = RandomForestClassifier(n_estimators=10)
    #clf = KNeighborsClassifier(n_neighbors=10)
    #clf = LinearSVC()
    
    clf.fit(X_train, y_train)
    
    print len(y_train)
    print len(y_test)
    
    pred = clf.predict(X_test)
    
    #pred = ['0']* len(y_test)
    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)
    total.append(score)
    
Example #58
    results = {
        'problem': [],
        'method': [],
        'score': []
    }

    if len(sys.argv) > 1 and sys.argv[1] == '--skip-train':
        results = pd.read_csv("./data/results.csv")
    else:
        for classification_dataset in classification_dataset_names:
            print("Starting", classification_dataset)

            X, y = fetch_data(classification_dataset, return_X_y=True, local_cache_dir='./data/')
            train_X, test_X, train_y, test_y = train_test_split(X, y)

            rf = RandomForestClassifier()
            lexRF = LexicaseForestClassifier()

            rf.fit(train_X, train_y)
            lexRF.fit(train_X, train_y)

            rf_score = rf.score(test_X, test_y)
            lexRF_score = lexRF.score(test_X, test_y)

            results['problem'] = results['problem'] + ([classification_dataset] * 2)
            results['method'] = results['method'] + ['RF', 'LexRF']
            results['score'].append(rf_score)
            results['score'].append(lexRF_score)

        results = pd.DataFrame(results)
        results.to_csv("./data/results.csv", index=False)