Example #1
def ReprodIndividualsFromRF(list_indiv, max_id, options):
    # Collect each individual's fitted decision tree and wrap the set in a
    # RandomForestClassifier so it can be processed as one ensemble.
    list_indiv = list(list_indiv)
    rf = RandomForestClassifier(n_estimators=len(list_indiv))
    trees = [indiv.clf for indiv in list_indiv]

    rf.estimators_ = trees
    rf.n_classes_ = trees[0].n_classes_
    rf.classes_ = trees[0].classes_

    # eqtree_rec_rf (defined elsewhere) builds a new decision tree from
    # the forest.
    new_dt = eqtree_rec_rf(rf,
                           0,
                           max_depth=options['max_depth'],
                           smallest_tree=False)

    new_id = max_id + 1

    indiv3 = genetic.individual(new_dt,
                                new_id,
                                type_rf=False,
                                alpha=options['alpha'],
                                evaluate_on_data=options['on_data'],
                                X=options['X'],
                                y=options['y'])

    return indiv3
Example #2
def deserialize_random_forest(model_dict):
    # Rebuild a fitted RandomForestClassifier from a plain dict.
    # Note: n_features_ and min_impurity_split below only exist on older
    # scikit-learn versions (both were removed in the 1.x series).
    model = RandomForestClassifier(**model_dict['params'])
    estimators = [deserialize_decision_tree(decision_tree) for decision_tree in model_dict['estimators_']]
    model.estimators_ = np.array(estimators)

    model.classes_ = np.array(model_dict['classes_'])
    model.n_features_ = model_dict['n_features_']
    model.n_outputs_ = model_dict['n_outputs_']
    model.max_depth = model_dict['max_depth']
    model.min_samples_split = model_dict['min_samples_split']
    model.min_samples_leaf = model_dict['min_samples_leaf']
    model.min_weight_fraction_leaf = model_dict['min_weight_fraction_leaf']
    model.max_features = model_dict['max_features']
    model.max_leaf_nodes = model_dict['max_leaf_nodes']
    model.min_impurity_decrease = model_dict['min_impurity_decrease']
    model.min_impurity_split = model_dict['min_impurity_split']

    if 'oob_score_' in model_dict:
        model.oob_score_ = model_dict['oob_score_']
    if 'oob_decision_function_' in model_dict:
        model.oob_decision_function_ = model_dict['oob_decision_function_']

    if isinstance(model_dict['n_classes_'], list):
        model.n_classes_ = np.array(model_dict['n_classes_'])
    else:
        model.n_classes_ = model_dict['n_classes_']

    return model
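For round-tripping, a serializer counterpart is implied by the keys this function reads. The sketch below derives its dict layout directly from the deserializer above; serialize_decision_tree is assumed to be the inverse of deserialize_decision_tree (not shown), and the attribute names match the older scikit-learn versions this code targets. The optional OOB fields are omitted for brevity.

def serialize_random_forest(model):
    # Inverse of deserialize_random_forest; assumes single-output classification.
    return {
        'params': model.get_params(),
        'estimators_': [serialize_decision_tree(t) for t in model.estimators_],
        'classes_': model.classes_.tolist(),
        'n_features_': model.n_features_,
        'n_outputs_': model.n_outputs_,
        'max_depth': model.max_depth,
        'min_samples_split': model.min_samples_split,
        'min_samples_leaf': model.min_samples_leaf,
        'min_weight_fraction_leaf': model.min_weight_fraction_leaf,
        'max_features': model.max_features,
        'max_leaf_nodes': model.max_leaf_nodes,
        'min_impurity_decrease': model.min_impurity_decrease,
        'min_impurity_split': model.min_impurity_split,
        'n_classes_': int(model.n_classes_),
    }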
Example #3
def build_classifier(trees):
    # Each t is a low-level sklearn.tree._tree.Tree; wrap it in a
    # DecisionTreeClassifier by filling in the fitted attributes manually.
    def build_decision_tree(t):
        dt = DecisionTreeClassifier(random_state=0)
        dt.n_features_ = t.n_features
        dt.n_outputs_ = t.n_outputs
        dt.n_classes_ = t.n_classes[0]
        dt.classes_ = np.arange(dt.n_classes_)
        dt.tree_ = t
        return dt

    if len(trees) > 1:
        clf = RandomForestClassifier(random_state=0, n_estimators=len(trees))
        clf.estimators_ = [build_decision_tree(t) for t in trees]
        clf.n_features_ = trees[0].n_features
        clf.n_outputs_ = trees[0].n_outputs
        clf.n_classes_ = trees[0].n_classes[0]
        clf.classes_ = np.arange(clf.n_classes_)
    else:
        clf = build_decision_tree(trees[0])
    return clf
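A hypothetical usage sketch: pull the low-level Tree objects out of an already fitted forest (each estimator's tree_ attribute) and rebuild a classifier from them. This runs on the older scikit-learn releases these snippets target, where n_features_ is a plain writable attribute.

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=100, random_state=0)
fitted = RandomForestClassifier(n_estimators=5, random_state=0).fit(X, y)
clf = build_classifier([est.tree_ for est in fitted.estimators_])
print(clf.predict(X[:3]))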
Example #4
def train_random_forest_classifier(training_data,
                                   n_est=100,
                                   use_bag_of_words=False):
    global Training_bag_of_words_features
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=nltk.word_tokenize,
                                 preprocessor=None,
                                 stop_words=stopwords.words('english'),
                                 max_features=10000)
    forest = RandomForestClassifier(n_estimators=n_est,
                                    class_weight="balanced")
    # Note: fit() recomputes classes_ from the labels, so this assignment
    # is only a placeholder.
    forest.classes_ = [0, 1]
    if use_bag_of_words:
        if Training_bag_of_words_features == []:
            Training_bag_of_words_features = generate_bag_of_word_features(
                training_data[:, -2])
        forest = forest.fit(Training_bag_of_words_features,
                            training_data[:, -1].astype('float'))
        scores = cross_val_score(forest,
                                 Training_bag_of_words_features,
                                 training_data[:, -1].astype('float'),
                                 cv=5,
                                 scoring='roc_auc')
    else:
        forest = forest.fit(
            training_data[:, selected_features].astype('float'),
            training_data[:, -1].astype('float'))
        scores = cross_val_score(
            forest,
            training_data[:, selected_features].astype('float'),
            training_data[:, -1].astype('float'),
            cv=5,
            scoring='roc_auc')
    print("Scores gotten using Random Forest (# of estimators=" + str(n_est) +
          ")")
    print(scores)
    print(np.mean(scores))
    return forest, np.mean(scores)
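Because the snippet above depends on globals defined elsewhere (Training_bag_of_words_features, selected_features, the NLTK corpora), here is a self-contained sketch of the same fit-then-cross-validate pattern on synthetic data:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Imbalanced two-class problem standing in for the real training data.
X, y = make_classification(n_samples=200, weights=[0.8, 0.2], random_state=0)
forest = RandomForestClassifier(n_estimators=100, class_weight="balanced")
forest = forest.fit(X, y)
scores = cross_val_score(forest, X, y, cv=5, scoring='roc_auc')
print(scores)
print(np.mean(scores))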
Example #5
def run_on_feature_union():
    load_word_embeddings()
    (training_data, _) = read_lines_from_file('data/filtered_features.csv')
    training_data = np.array(training_data)

    #training_data = generate_normalized_data(training_data)
    clf = RandomForestClassifier(n_estimators=100, class_weight="balanced")
    clf.classes_ = [0, 1]
    # The three classifiers below are constructed but never used; the
    # pipeline further down only plugs in clf.
    adaboost = AdaBoostClassifier(n_estimators=100)
    adaboost.classes_ = [0, 1]
    svm_clf = svm.SVC(probability=True)
    svm_clf.classes_ = [0, 1]
    randomized = ExtraTreesClassifier(n_estimators=45,
                                      max_depth=None,
                                      min_samples_split=2,
                                      class_weight="balanced")
    randomized.classes_ = [0, 1]
    pipeline = Pipeline([
        ('features',
         FeatureUnion(
             [('numeric_features', NumericFeaturesExtractor()),
              ('bag_of_words_features', BagOfWordsExtractor()),
              ('w2v_features', Word2VecExtractor())],
             transformer_weights={
                 'numeric_features': 0.5,
                 'bag_of_words_features': 0.9,
                 'w2v_features': 1.0,
             })), ('clf', clf)
    ])
    pipeline.fit(training_data, training_data[:, -1].astype('float'))
    scores = cross_val_score(pipeline,
                             training_data,
                             training_data[:, -1].astype('float'),
                             cv=10,
                             scoring='roc_auc')
    print(scores)
    print(np.mean(scores))
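The custom extractors (NumericFeaturesExtractor, BagOfWordsExtractor, Word2VecExtractor) are not shown in this snippet. A minimal runnable sketch of the same FeatureUnion pattern with stock transformers:

from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
pipeline = Pipeline([
    # FeatureUnion concatenates the transformer outputs column-wise,
    # scaled by transformer_weights.
    ('features', FeatureUnion(
        [('scaled', StandardScaler()), ('pca', PCA(n_components=3))],
        transformer_weights={'scaled': 0.5, 'pca': 1.0})),
    ('clf', RandomForestClassifier(n_estimators=100)),
])
print(cross_val_score(pipeline, X, y, cv=5, scoring='roc_auc').mean())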
Example #6
# Build the bag-of-words count matrix: one row per tweet, one column per word.
for index, tweet in enumerate(tweets):
    for word in tweet.split(' '):
        tweetX[index][bagOfWords[word]] += 1

#75-25 Train-Test split results
X_train, X_test, y_train, y_test = train_test_split(tweetX,
                                                    labels,
                                                    test_size=0.25,
                                                    random_state=42)

rf = RandomForestClassifier(n_estimators=50)
rf.classes_ = [0, 1, -1]
rf.n_classes_ = 3
rf.fit(X_train, y_train)

a = rf.predict(X_test)

print('Train-Test Split Results')
print(np.sum(a == y_test) * 100.0 / len(y_test))

# Cross-validation results (commented out)
'''
rf = RandomForestClassifier(n_estimators=50)
rf.classes_ = [0, 1, -1]
rf.n_classes_ = 3
rf.fit(tweetX, labels)
'''
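The commented-out block above cuts off in the source. A minimal sketch of how such a cross-validation is typically scored, reusing tweetX and labels from above (cross_val_score clones and refits the model on each fold, so the manual classes_ assignment is irrelevant here):

from sklearn.model_selection import cross_val_score

scores = cross_val_score(RandomForestClassifier(n_estimators=50),
                         tweetX, labels, cv=5)
print('Cross Validation Results')
print(scores.mean() * 100.0)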
Example #7
def Random_Forest_Classifier(X_train, Y_train):
    clf_rfc = RandomForestClassifier(n_estimators=100)
    # classes_ expects an array of class labels, not a count, but the
    # assignment is harmless: fit() recomputes classes_ from Y_train.
    clf_rfc.classes_ = 2
    clf_rfc = clf_rfc.fit(X_train, Y_train)
    return clf_rfc
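Worth noting: fit() derives classes_ from the labels and overwrites any manual assignment, so setting it beforehand has no lasting effect. A quick self-contained demonstration:

import numpy as np
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=10)
clf.classes_ = ['a', 'b']                # discarded on fit
clf.fit(np.random.rand(20, 3), np.array([0, 1] * 10))
print(clf.classes_)                      # [0 1], recomputed from the labels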
Example #8
feature_importance = []
for k in range(5):
    model = AdaBoostClassifier(
        n_estimators=100, learning_rate=1,
        base_estimator=DecisionTreeClassifier(max_depth=1,
                                              class_weight={True: 0.8, False: 0.2}))
    # model1 = RandomForestClassifier(n_estimators=100, max_depth=1,
    #                                 class_weight={True: 0.8, False: 0.2})
    # model2 = BaggingClassifier(
    #     base_estimator=DecisionTreeClassifier(max_depth=1,
    #                                           class_weight={True: 0.8, False: 0.2}),
    #     n_estimators=100, max_samples=1.0, max_features=1.0)
    # model1 = LinearSVM(C=1, class_weight={True: 0.8, False: 0.2})
    # model1 = SGDClassifier(shuffle=True, loss="log",
    #                        class_weight={True: 0.8, False: 0.2})
    model1 = SVC(kernel="rbf", probability=True,
                 class_weight={True: 0.8, False: 0.2})
    # model2 = SVC(kernel="poly", probability=True,
    #              class_weight={True: 0.8, False: 0.2}, degree=2)
    # model1 = LinearDiscriminantAnalysis()
    # svm.classes_ = [True, False]
    # pre_model = BernoulliRBM(learning_rate=0.1, n_components=10, n_iter=20)
    # model3 = Pipeline(steps=[("rbm", pre_model), ("svm", svm)])
    model.classes_ = [True, False]
    model1.classes_ = [True, False]
    # model2.classes_ = [True, False]

    # Balance the training set: sample 80% of the positives and an equal
    # number of negatives.
    positive_train_labels = [i for i in range(len(train_1_labels))
                             if train_1_labels[i]]
    positive_train_labels = list(np.random.choice(
        positive_train_labels,
        size=int(len(positive_train_labels) * 0.8),
        replace=False))
    # positive_test_labels = [i for i in range(len(test_1_labels))
    #                         if test_1_labels[i]]
    negative_train_labels = list(np.random.choice(
        [i for i in range(len(train_1_labels))
         if i not in positive_train_labels],
        replace=False, size=len(positive_train_labels)))
    # negative_test_labels = list(np.random.choice(
    #     [i for i in range(len(test_1_labels))
    #      if i not in positive_test_labels],
    #     replace=False, size=len(positive_test_labels)))

    # test_data = [test_data[i] for i in positive_test_labels + negative_test_labels]
    train_data1 = [train_data[i]
                   for i in positive_train_labels + negative_train_labels]

    # test_1_labels = [test_1_labels[i] for i in positive_test_labels + negative_test_labels]
    train_1_labels1 = [train_1_labels[i]
                       for i in positive_train_labels + negative_train_labels]
    model.fit(train_data1, train_1_labels1)
Example #9
    # Original random forest
    RF = RandomForestClassifier(n_estimators=rfSize)
    RF.fit(train_x, train_y)
    RF_path = model_path + '/RF.m'
    joblib.dump(RF, RF_path)

    # BRAF
    rf3 = RandomForestClassifier(n_estimators=rf2_size)
    rf3.fit(training_c_x, training_c_y)
    rf3_path = model_path + '/rf3.m'
    joblib.dump(rf3, rf3_path)

    RF1 = RandomForestClassifier(n_estimators=rfSize)
    # rf1 and rf2 are forests fitted earlier in this function (not shown).
    Gobaltree = rf1.estimators_ + rf3.estimators_
    RF1.estimators_ = Gobaltree
    RF1.classes_ = rf1.classes_
    RF1.n_classes_ = rf1.n_classes_
    RF1.n_outputs_ = rf1.n_outputs_
    RF1_path = model_path + '/braf.m'
    joblib.dump(RF1, RF1_path)

    # DBRF
    RF2 = RandomForestClassifier(n_estimators=rfSize)
    mod_Gobaltree = rf1.estimators_ + rf2.estimators_
    RF2.estimators_ = mod_Gobaltree
    RF2.classes_ = rf2.classes_
    RF2.n_classes_ = rf2.n_classes_
    RF2.n_outputs_ = rf2.n_outputs_
    RF2_path = model_path + '/borderlindbscan.m'
    joblib.dump(RF2, RF2_path)
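A self-contained sketch of the forest-merging trick used above: concatenate the fitted trees of two forests and copy the class metadata onto a fresh estimator. rf1 and rf2 in the snippet are fitted earlier and not shown, so synthetic stand-ins are used here; depending on the scikit-learn version, additional fitted attributes may be needed before predict works.

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=200, random_state=0)
rf_a = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)
rf_b = RandomForestClassifier(n_estimators=10, random_state=1).fit(X, y)

merged = RandomForestClassifier(n_estimators=20)
merged.estimators_ = rf_a.estimators_ + rf_b.estimators_  # 20 trees total
merged.classes_ = rf_a.classes_
merged.n_classes_ = rf_a.n_classes_
merged.n_outputs_ = rf_a.n_outputs_
print(merged.predict(X[:5]))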
Example #10
clf = RandomForestClassifier(class_weight='balanced',
                             criterion='entropy',
                             max_depth=None,
                             max_features='log2',
                             max_leaf_nodes=None,
                             min_impurity_split=1e-07,
                             min_samples_leaf=1,
                             min_samples_split=2,
                             min_weight_fraction_leaf=0.0,
                             n_estimators=100,
                             n_jobs=-1,
                             oob_score=False,
                             random_state=None,
                             verbose=0,
                             warm_start=False)
clf.classes_ = classes
skf = StratifiedKFold(n_splits=2)
"""
for train_index, test_index in skf.split(X, y):
	X_train, X_test = X[train_index], X[test_index]
	y_train, y_test = y[train_index], y[test_index]
	
	clf.fit(X_train, y_train)
	proba = clf.predict_proba(X_test)
	print(proba[:20])
	print(clf.score(X_test, y_test))
	print(log_loss(y_test, proba))
"""

clf.fit(X, y)
def main():

    # Set the logger
    logging_dir = args.log_dir
    engage_mode = args.engage_mode
    try:
        os.makedirs(logging_dir)
    except OSError:
        print("logging directory exists")

    log_file = 'train_RandomForest_round3_' + str(engage_mode) + '.log'

    utils.set_logger(os.path.join(logging_dir, log_file))
    logging.info(
        'model = sklearn.ensemble.RandomForestClassifier, engagement mode: Retweet'
    )
    logging.info('Using tf_idf values of tweet tokens and other features')
    logging.info('Does not use tweet token embeddings and user ids')
    logging.info('training set: {}'.format(args.train_file))

    batch_size = args.batch_size
    data_dir = args.data_dir
    vocab_file = args.vocab_file

    logging.info('batch size = {}'.format(batch_size))

    #load the vocab file with document count frequency for each token
    vocab_file_path = os.path.join(data_dir, vocab_file)
    vocab_handle = bz2.open(vocab_file_path, 'r')
    term_document_count = pickle.load(vocab_handle)

    #train_file = "train_first_1000.tsv"
    train_file = args.train_file
    trainfile_path = os.path.join(data_dir, train_file)

    #train_file = "train_first_1000.tsv"
    val_file = args.eval_file
    valfile_path = os.path.join(data_dir, val_file)

    TF_IDF = tf_idf.tf_idf(term_document_count)

    print("engagement mode: {}, value: {}".format(
        engage_mode, engagement_dict[engage_mode]))

    rng = np.random.RandomState(2020)

    #train(model, data_gen, TF_IDF, engage_mode)
    if args.mode == 'train_eval':
        model = RandomForestClassifier(warm_start=True, n_estimators=1)
        model.classes_ = [0, 1]
        TR_train = tweetrecords.TweetRecords(trainfile_path, batch_size,
                                             engage_mode)
        data_gen_train = TR_train.tokens_target_gen()
        logging.info('Starting Training')
        train_eval(model,
                   data_gen_train,
                   valfile_path,
                   TF_IDF,
                   engagement_dict[engage_mode],
                   args,
                   train=True)
        TR_train.close_file()
        logging.info('Finished Training')
    elif args.mode == 'eval':
        pretrained_file_path = os.path.join(args.saved_model_dir,
                                            args.pretrained_model)
        model = pickle.load(open(pretrained_file_path, 'rb'))
        eval(model, valfile_path, TF_IDF, engagement_dict[engage_mode], args)
        logging.info('Finished Evaluation')
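A minimal sketch of the warm_start pattern the train_eval branch presumably relies on: with warm_start=True, raising n_estimators and calling fit again adds trees to the existing forest instead of retraining from scratch, which is what makes batch-wise training feasible. Synthetic data stands in for the tweet records.

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=200, random_state=0)
model = RandomForestClassifier(warm_start=True, n_estimators=1)
for batch in range(3):
    model.fit(X, y)                # each call grows the existing forest
    model.n_estimators += 1        # request one more tree for the next fit
print(len(model.estimators_))      # 3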