def start_at(iteration):
    # Read the root directory from the parameters and initialize paths
    count_of_iterations = iteration - 1
    max_iterations = 1000
    all_start = time.time()

    while count_of_iterations != max_iterations:

        start = time.time()
        count_of_iterations += 1
        print("Starting new iteration... Current:", count_of_iterations)

        if count_of_iterations == iteration:
            feature_selection.select_features_at(iteration)
        else:
            feature_selection.select_features()

        # Train the model
        model_training.train_and_save_model()

        # Run the evaluation
        evaluation.write_evaluation(count_of_iterations, False)

        print('Iteration', count_of_iterations, 'finished. Duration:',
              round((time.time() - start) / 60, 2), 'min')

    print("all", str(max_iterations), "iterations finished.")
    print("Overall duration:", str(round((time.time() - all_start) / 60, 2)), 'min')
Example 2
def main(argv):
    print "Phishing URL predictor - Naive Bayes approach"
    training_file = argv[1]
    option = "random"
    if len(argv) > 2:
        option = argv[2]

    if option == "random":
        #Random shuffle run
        split = 0.8
        if len(argv) == 4:
            split = float(argv[3])

        data = read_data(training_file)
        # Filter the data down to the top features ranked by information gain.
        data = filter_features(
            data,
            fs.select_features(data, {
                "method": "info_gain",
                "num_features": 14
            }))
        X_train, y_train, X_test, y_test = split_train_test(data, split)
        class_prob, class_feature_value_count = train(X_train, y_train)
        store(class_prob, class_feature_value_count)
        accuracy = predict(X_test, y_test, class_prob,
                           class_feature_value_count)
        print "\n"
        print "Random test-train split. Training ratio = " + str(split) + "."
        print "\n"
        print "Accuracy = " + str(accuracy) + " %."
        print "\n"

    elif option == "cv":
        k = 5
        if len(argv) == 4:
            k = int(argv[3])

        kf = KFold(n_splits=k)
        data = read_data(training_file)
        data = filter_features(
            data,
            fs.select_features(data, {
                "method": "info_gain",
                "num_features": 25
            }))
        np.random.shuffle(data)
        X = data[:, :-1]
        y = data[:, -1]
        accuracy = 0.0
        for train_idx, test_idx in kf.split(data):
            class_prob, class_feature_value_count = train(
                X[train_idx], y[train_idx])
            accuracy += predict(X[test_idx], y[test_idx], class_prob,
                                class_feature_value_count)
        print "Cross-validated. Folds = " + str(k) + "."
        print "Average Accuracy over different folds = " + str(accuracy / k)

    else:
        print "Illegal options set. Use either 'random' or 'cv'"
Example 3
def main():
    """
    Main function
  """
    # Extract features
    if not feature_file_exists():
        extract_features()

    # Select features
    if not select_feature_file_exists():
        select_features()

    # Train model
    train()
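The existence checks used above are not shown; a minimal sketch, assuming each pipeline stage writes an artifact file whose presence marks the stage as done (both file names are placeholders):

import os

FEATURE_FILE = "features.csv"                    # placeholder name
SELECTED_FEATURE_FILE = "selected_features.csv"  # placeholder name

def feature_file_exists():
    return os.path.exists(FEATURE_FILE)

def select_feature_file_exists():
    return os.path.exists(SELECTED_FEATURE_FILE)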
Example 4
    'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi',
    'shared_receipt_with_poi'
]

data_dict = removeNaN(data_dict)
finance = ['salary']  # change this list to remove more outliers
data_dict = removeOutlier(data_dict, finance)
data_dict, new_feature = create_new_feature(data_dict)
features_list.append(new_feature)
k = 1
index = []
k_list = []
i = 0
while k < 20:
    index.append(k)
    new_features_list = list(select_features(data_dict, features_list, k))
    new_features_list.insert(0, 'poi')

    my_dataset = data_dict
    data = featureFormat(data_dict, new_features_list)
    labels, features = targetFeatureSplit(data)

    # Creating, training and validating a GaussianNB classifier

    from sklearn.naive_bayes import GaussianNB
    clf = GaussianNB()

    from sklearn.model_selection import train_test_split
    features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)
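    # A plausible continuation of the loop body, not part of the original
    # snippet: fit the classifier on the 70% split, record the held-out
    # accuracy (assuming k_list is meant to collect it), and advance k.
    from sklearn.metrics import accuracy_score
    clf.fit(features_train, labels_train)
    k_list.append(accuracy_score(labels_test, clf.predict(features_test)))
    k += 1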
Example 5
def main():
  # Read the data from the text files
  begin = time.time()
  vocab, train_raw, test_raw = read.read_tweets("../training_set_tweets.txt", "../test_set_tweets.txt")
  print "Num of Train users:", len(train_raw), "Num of Test users:", len(test_raw)
  print "Read data:", time.time() - begin

  # Preprocess the data
  begin = time.time()
  vocab, bigrams, train_word, test_word, train_char, test_char = preprocessing.preprocess(train_raw, test_raw)
  print "Preprocessed the data", time.time() - begin

  # Assign ids to words
  vocab_list = sorted(vocab)
  begin = time.time()
  vocab_dict = {word: i for i, word in enumerate(vocab_list)}
  print("Assigned ids to words:", time.time() - begin)

  # Build train and test set
  num_full_feats = len(vocab_list) + 10
  num_train_tweets = 0
  num_test_tweets = 0
  for author_id in train_word:
      num_train_tweets += len(train_word[author_id])
  for author_id in test_word:
      num_test_tweets += len(test_word[author_id])
  X_train = np.zeros((num_train_tweets, num_full_feats))
  y_train = np.zeros(num_train_tweets)
  X_test = np.zeros((num_test_tweets, num_full_feats))
  y_test = np.zeros(num_test_tweets)

  count = 0

  for author_id in train_word:
      for tweet in train_word[author_id]:
          X_train[count, :] = features.get_full_feats(tweet, vocab_dict)
          y_train[count] = author_id
          count += 1
  print(count)

  count = 0
  for author_id in test_word:
      for tweet in test_word[author_id]:
          X_test[count, :] = features.get_full_feats(tweet, vocab_dict)
          y_test[count] = author_id
          count += 1
  print(count)

  begin = time.time()
  feats = feature_selection.select_features(X_train, y_train, np.zeros(num_full_feats), 100, "dia")
  X_train = X_train[:, feats]
  X_test = X_test[:, feats]
  print "Features selected:", time.time() - begin

  begin = time.time()
  clf = model.train(X_train, y_train)
  acc, my_acc, preds, scores = model.test(clf, X_test, y_test)
  print('time:', time.time() - begin, 'acc:', acc, 'my_acc:', my_acc)
  print('preds:', preds)
  print('scores:', scores)

  print((preds == y_test)[:100])
  print(np.count_nonzero(scores > 0))
  print(np.count_nonzero(scores < 0))
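feature_selection.select_features above is project-specific; a minimal sketch of a selector compatible with the column slicing that follows it (X_train = X_train[:, feats]), with ANOVA F-scores standing in for the original "dia" criterion as an assumption:

import numpy as np
from sklearn.feature_selection import f_classif

def select_top_k_features(X, y, k):
    # Score every column against the labels and keep the k best indices.
    scores, _ = f_classif(X, y)
    scores = np.nan_to_num(scores)  # constant columns produce NaN scores
    return np.argsort(scores)[::-1][:k]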
Example 6
    folder_plots = 'plots/'
    os.makedirs(folder_plots, exist_ok=True)

    X = tensor_data
    y = annotations.reset_index(drop=True)

    X = extract_df_with_features(X, y, attributes, [target_class], data_folder)
    # X = extract_basic_features(X, y, attributes)
    y_target = y[target_class]
    X_ids = X['recordingID']
    X = X.drop(['recordingID', target_class], axis=1)

    # select the features with feature selection
    selected_features = select_features(
        X, y_target, 0.01, attributes,
        data_folder)  # note: fails with a 10% threshold (0.1)
    for f in selected_features:
        if f not in X.columns.values:
            selected_features = selected_features.drop(f)
    X = X[selected_features]
    print("Number of selected features:", len(selected_features))

    # add duration as a feature
    #X['duration'] = y['duration']
    # X = X[['duration']]  # only duration as feature

    y.loc[:, 'score'] = (1 - y.loc[:, target_class]) / y.loc[:, 'duration']
    score_values_nozeros = y[y.score > 0].score.values
    mu, std = norm.fit(score_values_nozeros)
    score_values = y.score.values
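The gaussian helper later applied to the fitted (mu, std) pair (see Example 8) is not shown; a minimal sketch, assuming it is simply the normal probability density evaluated elementwise:

import numpy as np

def gaussian(x, mu, std):
    # Normal pdf with mean mu and standard deviation std.
    return np.exp(-0.5 * ((x - mu) / std) ** 2) / (std * np.sqrt(2 * np.pi))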
Example 7
import pandas as pd
from sklearn.model_selection import train_test_split
from feature_selection import select_features
# our libraries
import preprocessing
import random_forests
import ann

dataset = pd.read_csv('data/student-mat.csv', delimiter=";")
dataset = preprocessing.preprocess(dataset)
x, y = preprocessing.split_attributes(dataset, 3)
x = select_features(x, 16)
y = preprocessing.bucketize_y(y, 2)

cols = list(x.columns)
cols.extend(y.columns)

new_data = pd.DataFrame(data=x.join(y), columns=cols)

# Split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y.values[:, 2], test_size=0.2, random_state=0)

cm_rf = random_forests.classify(x_train,x_test,y_train,y_test)

# Neural network
history = ann.build_and_train_net(x_train,y_train,x_test,y_test)
cm_ann = ann.test_classifier(x_test,y_test)
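As a usage note, the returned confusion matrices can be collapsed into single accuracy numbers; a short sketch assuming cm_rf and cm_ann are sklearn-style 2-D count matrices:

import numpy as np

def cm_accuracy(cm):
    # Correct predictions sit on the diagonal of a confusion matrix.
    cm = np.asarray(cm)
    return np.trace(cm) / cm.sum()

print("RF accuracy:", cm_accuracy(cm_rf))
print("ANN accuracy:", cm_accuracy(cm_ann))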
Example 8
    attributes = ['bvp', 'gsr', 'hrv', 'ibi', 'tmp']

    folder_plots = 'plots/'
    os.makedirs(folder_plots, exist_ok=True)

    X = tensor_data
    y = annotations.reset_index(drop=True)
    X = extract_df_with_features(X, y, attributes, [target_class], data_folder)
    #X = extract_basic_features(X, y, attributes)
    y_target = y[target_class]
    X_ids = X['recordingID']
    X = X.drop(['recordingID', target_class], axis=1)

    # select the features with feature selection
    selected_features = select_features(X, y_target, 0.025, attributes,
                                        data_folder)
    for f in selected_features:
        if f not in X.columns.values:
            selected_features = selected_features.drop(f)
    X = X[selected_features]

    # add duration as a feature
    # X.loc[:, 'duration'] = y.loc[:, 'duration']
    #X = X[['duration']]

    y.loc[:, 'score'] = (1 - y.loc[:, target_class]) / y.loc[:, 'duration']
    score_values_nozeros = y[y.score > 0].score.values
    mu, std = norm.fit(score_values_nozeros)
    score_values = y.score.values
    y.loc[:, 'score_normalized'] = gaussian(score_values, mu, std)
    y.loc[:, 'score_norm_binary'] = pd.cut(y.loc[:, 'score_normalized'],
Example 9
imgpr.create_masks_and_nrrds(dataPath)

# Feature extraction
img2use = ["T2"]
mask2use = ["M+"]
paramsPath = "../Params.yaml"
fextr.extract_features_from_all(dataPath, img2use, mask2use, paramsPath, selectionFeaturesPath, manualFeaturesPath)


# Feature selection
FSmethod = 'MRMR'
FSparams = {'nFeatures': 15, 
            'internalFEMethod': 'MID', 
            'nBins': 4, 
            'discStrategy': 'kmeans'}
selectedFeatures = fesel.select_features(FSmethod, FSparams, selectionFeaturesPath, manualFeaturesPath)
print(f'Features selected by {FSmethod}:')
print(selectedFeatures)


# Prediction model
MLmethod = 'RFreg'
rfParams = {'n_estimators': [5, 10, 15, 25, 50, 75], 
            'max_depth': [None, 1, 3, 5, 10, 15],
            'max_features': [0.33, 0.67, 1.0, 'sqrt']}  
scoring = 'r2'
yTrueTest, yPredRegTest, yTrueVal, yPredRegVal, MLparams = mlpred.create_evaluate_model(
    MLmethod, rfParams, selectedFeatures, selectionFeaturesPath, manualFeaturesPath,
    paramSearchResultsPath, optimizeParams=True, scoringOptiMetric=scoring)

# Write validation- and test- results to csv-file
mlpred.write_results_to_csv(predResultsPath, selectionFeaturesPath, FSmethod, FSparams,
                            selectedFeatures, MLmethod, MLparams, yTrueTest, yPredRegTest,
                            yTrueVal, yPredRegVal)
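With optimizeParams=True, create_evaluate_model presumably searches the rfParams grid internally; a minimal sketch of such a search with scikit-learn, where X_sel and y are placeholders for the selected feature matrix and the regression targets:

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

search = GridSearchCV(RandomForestRegressor(random_state=0),
                      param_grid=rfParams, scoring=scoring, cv=5)
# search.fit(X_sel, y)
# print(search.best_params_, search.best_score_)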