Example #1
import sys

from pyspark import SparkContext, StorageLevel

# assumed setup: the source snippet uses an `sc` defined outside the excerpt
sc = SparkContext(appName="preprocess")


def main():

    # which model the data is being prepared for:
    # logistic regression or random forest ("rf")
    output_type = sys.argv[1]

    raw_file_path = sys.argv[2]

    path_to_output = sys.argv[3]

    raw_input_rdd = sc.textFile(
        raw_file_path, minPartitions=32).map(lambda line: line.encode("utf-8"))

    # replace tab characters (project helper defined elsewhere)
    process_data = raw_input_rdd.map(replacetab)

    df_for_pp = create_df(process_data)

    pp = PreProcess(df_for_pp)

    preprocessed_data = pp.preprocess_data()

    data = None

    if output_type == "rf":
        data = prep_rf(preprocessed_data)

    # persist, then write the prepared data out as parquet; MEMORY_AND_DISK
    # is the named equivalent of StorageLevel(True, True, False, False, 1)
    data.persist(StorageLevel.MEMORY_AND_DISK)
    data.write.parquet(path_to_output + "final_" + output_type +
                       "_data.parquet")
Example #2
 def update_recommendations(self):
     """
     Note:
     """
     if self.pairs_served < 1:
         self.recommendations = {}
         for metric in self.metrics:
             self.df1 = PreProcess(self.df1).filter_df(metric)
             self.df2 = PreProcess(self.df2).filter_df(metric)
             self.update_similarity_matrix('euclidean')
             # reset the dfs to their original versions for the next iteration
             self.df1 = self.orginal_df1
             self.df2 = self.orginal_df2
             self.recommendations[metric] = np.argsort(
                 self.sim_mat[0])[-(self.num_matches):].tolist()
     return self.recommendations
Example #3
from pre_processing import PreProcess
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn import tree
from sklearn.metrics import accuracy_score
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

preprocess = PreProcess("data/train", "data/test")
preprocess.read_train_test_data()
max_features = 1000
feat_list = []
acc_list = []
while max_features < 41000:
    count_vect = CountVectorizer(stop_words='english',
                                 max_features=max_features)
    X_train_fit = count_vect.fit(preprocess.training_data)
    X_train_counts = X_train_fit.transform(preprocess.training_data)
    tfIdfFit = TfidfTransformer(use_idf=True, norm='l2',
                                sublinear_tf=True).fit(X_train_counts)
    preprocess.traintfIdf = tfIdfFit.transform(X_train_counts)
    X_test_counts = X_train_fit.transform(preprocess.test_data)
    preprocess.testtfIdf = tfIdfFit.transform(X_test_counts)

    nb_clf = MultinomialNB(alpha=0.1, fit_prior=True, class_prior=None)
    nb_clf.fit(preprocess.traintfIdf, preprocess.train_target)
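    # (the source cuts off here; an assumed continuation that records the
    # accuracy at this feature count and advances the loop, which otherwise
    # never terminates; test_target is an assumed attribute of PreProcess)
    test_pred = nb_clf.predict(preprocess.testtfIdf)
    acc_list.append(accuracy_score(preprocess.test_target, test_pred))
    feat_list.append(max_features)
    max_features += 5000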
Example #4
    print("-------------------------------------\n")
    KNN_5 = NearestNeighbors(train, 5)
    KNN_5.train(train)
    res_k5 = KNN_5.test(test)

    return res_k5


if __name__ == '__main__':

    start_time = time.process_time()

    print("PRE-PROCESSING DATA...")
    print("-------------------------------------\n")

    pre_process = PreProcess()
    folder = 'data/'
    data = pre_process.pre_process(folder)

    splits = [.9, .8, .7, .6]
    '''
    The following lists will contain lists, each with a result, in the format:
    [total, correct, accuracy, precision, recall, f1_score]
    '''
    nb_results = []
    knn1_results = []
    knn5_results = []

    for split in splits:
        print("----------------------")
        print("|| SPLIT = {}/{} ||".format(split, round(1 - split, 1)))
Example #5
# GridSearchCV and cross_val_score now live in sklearn.model_selection
# (sklearn.grid_search and sklearn.cross_validation were removed in 0.20)
from sklearn.model_selection import GridSearchCV, cross_val_score
from pre_processing import PreProcess
from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

preprocess = PreProcess("data/train", "data/test")
preprocess.read_train_test_data()
preprocess.getTfIdf()

ab_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                            n_estimators=500,
                            learning_rate=1)

scores = cross_val_score(ab_clf,
                         preprocess.traintfIdf,
                         preprocess.train_target,
                         cv=3)
print("the cross validated accuracy on training is " + str(scores))
print(
    "the cross validated accuracy(standard deviation) on training is: %0.4f (+/- %0.4f)"
    % (scores.mean(), scores.std() * 2))

ab_clf.fit(preprocess.traintfIdf, preprocess.train_target)
# finding the training and test predictions
train_pred_ab = ab_clf.predict(preprocess.traintfIdf)
test_pred_ab = ab_clf.predict(preprocess.testtfIdf)

ab_train_accuracy = metrics.accuracy_score(preprocess.train_target,
                                           train_pred_ab)
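# (the source ends here; an assumed continuation mirroring the train-side
# accuracy above; test_target is an assumed attribute of PreProcess)
ab_test_accuracy = metrics.accuracy_score(preprocess.test_target, test_pred_ab)
print("train accuracy: %0.4f, test accuracy: %0.4f"
      % (ab_train_accuracy, ab_test_accuracy))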
Example #6
# ftest = open("data/svm_wrong.dat", 'r')
# svm = np.loadtxt(ftest, delimiter=',')
#
# ftest = open("data/softmax_wrong.dat", 'r')
# sm = np.loadtxt(ftest, delimiter=',')
#
#
# ftest = open("data/nb_wrong.dat", 'r')
# nb = np.loadtxt(ftest, delimiter=',')
#
# svm = set(svm)
# sm = set(sm)
# nb = set(nb)
# venn3([svm, sm, nb], ('SVM', 'Softmax', 'Naive Bayes'))
# plt.show()
preprocess = PreProcess("data/train", "data/test")
preprocess.read_train_test_data()
preprocess.getTfIdf()
preprocess.polarity_POS_features()
pca = PCA(n_components=2)

# fit_transform is equivalent to the original's separate fit(...).transform(...)
X_train_dense = preprocess.traintfIdf.toarray()
X_r = pca.fit_transform(X_train_dense)
print("The number of features " + str(pca.n_components_))
target_names = ['Bad', 'Neutral', 'Good']
plt.figure()
colors = ['navy', 'turquoise', 'darkorange']
lw = 2

for color, i, target_name in zip(colors, [0, 1, 2], target_names):
    plt.scatter(X_r[preprocess.train_target == i, 0], X_r[preprocess.train_target == i, 1], color=color, alpha=.8, lw=lw,
                label=target_name)
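# (the source cuts off here; an assumed finish for the scatter plot)
plt.legend(loc='best', scatterpoints=1)
plt.title('PCA of the tf-idf training features')
plt.show()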
Example #7
import numpy as np
import pandas as pd  # used below but missing from the source's imports
from sklearn.metrics.pairwise import pairwise_distances
from pre_processing import PreProcess
from learn_preferences import LearnPreferences


df_sea = pd.read_csv('data/metrics/seattle_test.csv', index_col=0)
# the source reads 'data/metricssanfran_test.csv'; the missing separator is
# assumed to be a typo, matching the seattle path above
df_sf = pd.read_csv('data/metrics/sanfran_test.csv', index_col=0)

# reference columns shared by both cities
ref_cols = ['city', 'state', 'street', 'finishedsqft',
            'bedrooms', 'bathrooms', 'trans_score', 'walkscore_score']
sea_ref = df_sea[ref_cols]
sf_ref = df_sf[ref_cols]

# create the PreProcess objects
prep_sf = PreProcess(df_sf)
prep_sea = PreProcess(df_sea)

# drop the unnecessary columns, clean up NAs, and normalize for use in the recommender
prep_sf.drop_columns()
sf = prep_sf.preprocess_df()
sf = prep_sf.normalize_columns()
prep_sea.drop_columns()
sea = prep_sea.preprocess_df()
sea = prep_sea.normalize_columns()

# specify the metrics to use for the similarity matrix 
metrics = ['walk_distance', 'space_distance']

# init a LearnPreferences object seeded with the San Francisco house at index 3, using 50 listings
lp = LearnPreferences(sf, sea, sf_ref, sea_ref, metrics, 3, 50)
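Example #2 above shows an update_recommendations method on what looks like
this same LearnPreferences class; assuming that, the per-metric matches could
then be pulled with:

    recommendations = lp.update_recommendations()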
Example #8
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix

from pre_processing import PreProcess

# define image-processing parameters
low_t = 10
high_t = 100
max_length = 5

SRC_DIR = ''

data_process = PreProcess(SRC_DIR)
input_data_1 = data_process.process_image(low_t, high_t, max_length)

SRC_DIR = ''
data_process = PreProcess(SRC_DIR)
input_data_2 = data_process.process_image(low_t, high_t, max_length)

# DataFrame.append returns a new frame rather than mutating in place, so the
# original call discarded its result; pd.concat is the equivalent fix,
# assuming process_image returns DataFrames (implied by the drop below)
imgdata = pd.concat([input_data_1, input_data_2], ignore_index=True)
x = imgdata.drop('Class', axis=1)
y = imgdata['Class']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20)

svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)
y_pred = svm_classifier.predict(X_test)  # the source's `svclassifier` is undefined
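# (assumed continuation: evaluate the predictions with the confusion_matrix
# imported above, which the source never uses)
print(confusion_matrix(y_test, y_pred))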