def classify(self, features, classifier=None):
    """Build a FeatureUnion + classifier pipeline and fit it on the training data.

    features: list of (name, transformer) tuples for the FeatureUnion.
    classifier: optional sklearn estimator; defaults to MultinomialNB().
    """
    feature_union = ('feats', FeatureUnion(features))
    if classifier is None:  # fixed: 'is None' instead of '== None' (PEP 8)
        classifier = MultinomialNB()
    self.classifier = Pipeline([feature_union, ('classifier', classifier)])
    self.printer = Printer('Model Fitting', self.show_fitting)
    self.classifier.fit(self.X_train, self.Y_train)
    self.printer.duration()
def classify(self, features, classifier=None):
    """Build a FeatureUnion + decision-tree pipeline and fit it on the training data.

    features: list of (name, transformer) tuples for the FeatureUnion.
    classifier: optional sklearn estimator; defaults to
        DecisionTreeClassifier(min_samples_leaf=2, max_depth=50).
    """
    feature_union = ('feats', FeatureUnion(features))
    if classifier is None:  # fixed: 'is None' instead of '== None' (PEP 8)
        classifier = DecisionTreeClassifier(min_samples_leaf=2, max_depth=50)
    self.classifier = Pipeline([feature_union, ('classifier', classifier)])
    self.printer = Printer('Model Fitting', self.show_fitting)
    self.classifier.fit(self.X_train, self.Y_train)
    self.printer.duration()
def classify(self, features, classifier=None):
    """Split the prepared data, build a Dense feed-forward model and train it.

    'features' and 'classifier' are unused; kept for interface compatibility
    with the sklearn-based model classes.
    """
    self.tokenize()
    # Data layout: [train | development | test], split by the data's counts.
    train_development_split = self.data.amount_train
    development_test_split = self.data.amount_train + self.data.amount_development
    self.X_train = self.X[:train_development_split]
    self.Y_train = self.Y[:train_development_split]
    self.X_development = self.X[
        train_development_split:development_test_split]
    self.Y_development = self.Y[
        train_development_split:development_test_split]
    self.X_test = self.X[development_test_split:]
    if self.data.avoid_skewness:
        # Undo one-hot encoding to rebalance by label name, then re-wrap
        # the unskewed lists as arrays for Keras.
        Y_train = np.argmax(self.Y_train, axis=1)
        Y_train = [self.labels_dict_rev[int(i)] for i in list(Y_train)]
        self.X_train, self.Y_train = unskewedTrain(self.X_train,
                                                   self.Y_train, Y_train)
        self.X_train = np.array(self.X_train)
        self.Y_train = np.array(self.Y_train)
    ##CHANGE OPTIONS HERE
    # NOTE(review): output layer is Dense(6) — presumably 6 classes; confirm
    # it matches len(self.labels).
    self.model = Sequential()
    self.model.add(
        Dense(512,
              input_shape=(self.feature_length, self.feature_dimensions)))
    self.model.add(Flatten())
    self.model.add(Dense(6))
    self.model.add(Activation('relu'))
    # self.model.add(Dropout(0.2))
    #
    # self.model.add(Dense(128))
    # self.model.add(Activation('relu'))
    # self.model.add(Dropout(0.1))
    # self.model.add(Dense(6, input_dim=self.feature_length,)))
    # self.model.add(Activation('sigmoid'))
    self.model.compile(loss='categorical_crossentropy',
                       optimizer='adam',
                       metrics=['accuracy'])
    # Train the model
    self.printer = Printer('Model Fitting', self.show_fitting)
    self.model.fit(self.X_train,
                   self.Y_train,
                   epochs=50,
                   batch_size=128,
                   validation_split=0.2)
    self.printer.duration()
def classify(self, features, classifier=None):
    """Split the prepared data, build the embedding + LSTM model and train it.

    'features' and 'classifier' are unused; kept for interface compatibility
    with the sklearn-based model classes.
    """
    self.tokenize()
    # Data layout: [train | development | test], split by the data's counts.
    train_development_split = self.data.amount_train
    development_test_split = self.data.amount_train + self.data.amount_development
    self.X_train = self.X[:train_development_split]
    self.Y_train = self.Y[:train_development_split]
    self.X_development = self.X[
        train_development_split:development_test_split]
    self.Y_development = self.Y[
        train_development_split:development_test_split]
    self.X_test = self.X[development_test_split:]
    if self.data.avoid_skewness:
        # Undo one-hot encoding to rebalance by label name, then re-wrap
        # the unskewed lists as arrays for Keras.
        Y_train = np.argmax(self.Y_train, axis=1)
        Y_train = [self.labels_dict_rev[int(i)] for i in list(Y_train)]
        self.X_train, self.Y_train = unskewedTrain(self.X_train,
                                                   self.Y_train, Y_train)
        self.X_train = np.array(self.X_train)
        self.Y_train = np.array(self.Y_train)
    # Load cached embeddings; rebuild them from the embeddings file on a miss.
    self.word_embeddings_layer, self.word_embeddings_index = readWordEmbeddings(
        self.data.languages, self.data.response_variable)
    if self.word_embeddings_layer == None:
        self.createWordEmbeddings()
    self.printDataInformation()
    ##CHANGE OPTIONS HERE
    self.model = Sequential()
    self.model.add(self.word_embeddings_layer)
    self.model.add(Dropout(0.2))
    self.model.add(LSTM(self.word_embeddings_dim))
    self.model.add(Dense(self.Y.shape[1], activation='sigmoid'))
    self.model.compile(loss='categorical_crossentropy',
                       optimizer='adam',
                       metrics=['accuracy'])
    # Train the model
    self.printer = Printer('Model Fitting', self.show_fitting)
    self.model.fit(self.X_train,
                   self.Y_train,
                   epochs=5,
                   batch_size=128,
                   validation_split=0.2)
    self.printer.duration()
def classify(self, features, classifier=None):
    """Build a FeatureUnion + SGD pipeline and fit it on the training data.

    features: list of (name, transformer) tuples for the FeatureUnion.
    classifier: optional sklearn estimator; defaults to a hinge-loss
        SGDClassifier (linear SVM) with a fixed random_state.
    """
    feature_union = ('feats', FeatureUnion(features))
    if classifier is None:  # fixed: 'is None' instead of '== None' (PEP 8)
        classifier = SGDClassifier(loss='hinge', random_state=42,
                                   max_iter=50, tol=None)
    self.classifier = Pipeline([feature_union, ('classifier', classifier)])
    # Removed leftover debug print of the full pipeline object.
    self.printer = Printer('Model Fitting', self.show_fitting)
    self.classifier.fit(self.X_train, self.Y_train)
    self.printer.duration()
def __init__(self, k, method, data, features, new_classifier, print_details,
             show_fitting):
    """Store the cross-validation configuration and run it immediately."""
    self.k = k
    self.kf = KFold(n_splits=k)
    self.method = method
    self.data = data
    self.features = features
    self.new_classifier = new_classifier
    self.print_details = print_details
    self.show_fitting = show_fitting
    self.printer = Printer(str(self.k) + '-Fold validation')
    # Validation starts as soon as the object is constructed.
    self.validation()
def classify(self, features, classifier=None):
    """Build a FeatureUnion + SGD pipeline and fit it on the training data.

    features: list of (name, transformer) tuples for the FeatureUnion.
    classifier: optional sklearn estimator; defaults to a hinge-loss
        SGDClassifier (linear SVM) with a fixed random_state.
    """
    feature_union = ('feats', FeatureUnion(features))
    if classifier is None:  # fixed: 'is None' instead of '== None' (PEP 8)
        classifier = SGDClassifier(loss='hinge', random_state=42,
                                   max_iter=50, tol=None)
    self.classifier = Pipeline([feature_union, ('classifier', classifier)])
    self.printer = Printer('Model Fitting', self.show_fitting)
    # Removed commented-out train_test_split experiment code.
    self.classifier.fit(self.X_train, self.Y_train)
    self.printer.duration()
class DecisionTree:
    """Decision-tree classifier over a FeatureUnion pipeline.

    Exposes the shared classify/evaluate/print interface used by the other
    model classes in this project.
    """

    # Class-level defaults; shadowed by the instance attributes assigned in
    # __init__ and kept only for backward compatibility.
    X_train = []
    Y_train = []
    X_development = []
    Y_development = []
    X_test = []
    Y_predicted = []
    labels = []
    features = []

    def __init__(self, data, predict_method, show_fitting):
        """Copy the prepared splits and labels from the data object."""
        self.X_train = data.X_train
        self.Y_train = data.Y_train
        self.X_development = data.X_development
        self.Y_development = data.Y_development
        self.X_test = data.X_test
        self.labels = data.labels
        self.predict_method = predict_method
        self.show_fitting = show_fitting

    def classify(self, features, classifier=None):
        """Fit a FeatureUnion + decision-tree pipeline on the training data.

        classifier defaults to DecisionTreeClassifier(min_samples_leaf=2,
        max_depth=50) when not supplied.
        """
        feature_union = ('feats', FeatureUnion(features))
        if classifier is None:  # fixed: 'is None' instead of '== None'
            classifier = DecisionTreeClassifier(min_samples_leaf=2,
                                                max_depth=50)
        self.classifier = Pipeline([feature_union,
                                    ('classifier', classifier)])
        self.printer = Printer('Model Fitting', self.show_fitting)
        self.classifier.fit(self.X_train, self.Y_train)
        self.printer.duration()

    def evaluate(self):
        """Predict on the development (and test) split and compute metrics."""
        if self.X_development:
            self.Y_development_predicted = self.classifier.predict(
                self.X_development)
        if self.X_test:
            self.Y_test_predicted = self.classifier.predict(self.X_test)
        self.accuracy, self.precision, self.recall, self.f1score = \
            classificationMetrics(self.Y_development,
                                  self.Y_development_predicted, self.labels)

    def printBasicEvaluation(self):
        """Print overall accuracy/precision/recall/F1."""
        self.printer.evaluation(self.accuracy, self.precision, self.recall,
                                self.f1score, "Basic Evaluation")

    def printClassEvaluation(self):
        """Print per-class metrics for the development predictions."""
        self.printer.classEvaluation(self.Y_development,
                                     self.Y_development_predicted,
                                     self.labels)
class NaiveBayes:
    """Multinomial naive-Bayes classifier over a FeatureUnion pipeline.

    Exposes the shared classify/evaluate/print interface used by the other
    model classes in this project.
    """

    # Class-level defaults; shadowed by the instance attributes assigned in
    # __init__ and kept only for backward compatibility.
    X_train = []
    Y_train = []
    X_development = []
    Y_development = []
    X_test = []
    Y_predicted = []
    labels = []
    features = []

    def __init__(self, data, show_fitting):
        """Copy the prepared splits and labels from the data object."""
        self.X_train = data.X_train
        self.Y_train = data.Y_train
        self.X_development = data.X_development
        self.Y_development = data.Y_development
        self.X_test = data.X_test
        self.labels = data.labels
        self.show_fitting = show_fitting

    def classify(self, features, classifier=None):
        """Fit a FeatureUnion + naive-Bayes pipeline on the training data.

        classifier defaults to MultinomialNB() when not supplied.
        """
        feature_union = ('feats', FeatureUnion(features))
        if classifier is None:  # fixed: 'is None' instead of '== None'
            classifier = MultinomialNB()
        self.classifier = Pipeline([feature_union,
                                    ('classifier', classifier)])
        self.printer = Printer('Model Fitting', self.show_fitting)
        self.classifier.fit(self.X_train, self.Y_train)
        self.printer.duration()

    def evaluate(self):
        """Predict on the development (and test) split and compute metrics."""
        if self.X_development:
            self.Y_development_predicted = self.classifier.predict(
                self.X_development)
        if self.X_test:
            self.Y_test_predicted = self.classifier.predict(self.X_test)
        self.accuracy, self.precision, self.recall, self.f1score = metrics(
            self.Y_development, self.Y_development_predicted, self.labels)

    def printBasicEvaluation(self):
        """Print overall accuracy/precision/recall/F1."""
        self.printer.evaluation(self.accuracy, self.precision, self.recall,
                                self.f1score, "Basic Evaluation")

    def printClassEvaluation(self):
        """Print per-class metrics for the development predictions."""
        self.printer.classEvaluation(self.Y_development,
                                     self.Y_development_predicted,
                                     self.labels)
class Baseline:
    """Baseline model: fits a plain Classifier() without a feature pipeline."""

    # Class-level defaults; shadowed by the instance attributes assigned in
    # __init__.
    X_train = []
    Y_train = []
    X_development = []
    Y_development = []
    X_test = []
    labels = []
    features = []

    def __init__(self, data, predict_method, show_fitting):
        """Copy the prepared splits from the data object and build the classifier."""
        self.X_train = data.X_train
        self.Y_train = data.Y_train
        self.X_development = data.X_development
        self.Y_development = data.Y_development
        self.X_test = data.X_test
        self.labels = data.labels
        self.predict_method = predict_method
        self.show_fitting = show_fitting
        self.classifier = Classifier()

    def classify(self, features, classifier=None):
        """Fit the baseline classifier; 'features' and 'classifier' are ignored."""
        self.printer = Printer('Model Fitting', self.show_fitting)
        self.classifier.fit(self.X_train, self.Y_train)
        self.printer.duration()

    def evaluate(self):
        """Predict on development/test splits and compute classification metrics."""
        if self.X_development:
            self.Y_development_predicted = self.classifier.predict(
                self.X_development)
        if self.X_test:
            self.Y_test_predicted = self.classifier.predict(self.X_test)
        self.accuracy, self.precision, self.recall, self.f1score = \
            classificationMetrics(self.Y_development,
                                  self.Y_development_predicted, self.labels)

    def printBasicEvaluation(self):
        """Print overall accuracy/precision/recall/F1."""
        self.printer.evaluation(self.accuracy, self.precision, self.recall,
                                self.f1score, "Basic Evaluation")

    def printClassEvaluation(self):
        """Print per-class metrics for the development predictions."""
        self.printer.classEvaluation(self.Y_development,
                                     self.Y_development_predicted,
                                     self.labels)
def validation(self):
    """Train and evaluate one classifier per fold, collecting metrics.

    Only the train+development portion of the data is folded; each fold's
    split is pushed back into the data object before training.
    """
    i = 0  # 1-based fold counter, used only for progress printing
    for train_index, test_index in self.kf.split(
            self.data.X[:(self.data.amount_train +
                          self.data.amount_development)]):
        i += 1
        # n_printer exists only when print_details >= 4; the later uses at
        # levels 7 and 4 are therefore safe.
        if self.print_details >= 4:
            n_printer = Printer(str(self.k) + '-Fold, Run: ' + str(i))
        X_train, X_development = list(np.array(
            self.data.X)[train_index]), list(
                np.array(self.data.X)[test_index])
        Y_train, Y_development = list(np.array(
            self.data.Y)[train_index]), list(
                np.array(self.data.Y)[test_index])
        # Re-initialize the shared data object with this fold's split.
        self.data.initialize(X_train, Y_train, X_development, Y_development)
        classifier = selectClassifier(self.method, self.data,
                                      self.predict_method, self.show_fitting)
        classifier.classify(self.features, self.new_classifier)
        classifier.evaluate()
        # Accumulate per-fold metrics for later averaging.
        self.accuracy.append(classifier.accuracy)
        self.precision.append(classifier.precision)
        self.recall.append(classifier.recall)
        self.f1score.append(classifier.f1score)
        # Increasing verbosity levels: 5 = basic, 6 = per-class,
        # 7 = confusion matrix.
        if self.print_details >= 5:
            classifier.printBasicEvaluation()
        if self.print_details >= 6:
            classifier.printClassEvaluation()
        if self.print_details >= 7:
            n_printer.confusionMatrix(classifier.Y_development,
                                      classifier.Y_development_predicted,
                                      self.data.labels)
        # writeResults(options.args.predict_languages, classifier.Y_development, classifier.Y_development_predicted, 'development')
        if self.print_details >= 4:
            n_printer.duration()
options = Options('System parameters', con) #Step 5: Read all custom arguments/options options.add(name='predict_languages', _type=str, _default='esdi', _help='specify which language you want to predict') #Step 6: Parse arguments options.parse() #Use random seed random.seed(options.args.random_seed) #Custom function to read language from input options.args.predict_languages = languages(options.args.predict_languages) #Print system printer = Printer('System') printer.system(options.args_dict) #Step 7: Create data with default arguments data = Data(options.args.avoid_skewness, options.args.data_folder, options.args.predict_label, options.args.data_method) #Step 8: Add all datasources and transform them to row(Y, X) format #Custom, should be self-made! #Step 8.1: Add the files or folders the data is preserved in (only if available) data.file_train = 'impression_data.csv' #Custom function data.languages = options.args.predict_languages #Load data into a file
class SVM:
    """Linear SVM (hinge-loss SGDClassifier) over a FeatureUnion pipeline.

    Supports classification or regression-style evaluation, selected by
    predict_method ('classification' or 'regression').
    """

    # Class-level defaults; shadowed by the instance attributes assigned in
    # __init__ and kept only for backward compatibility.
    X_train = []
    Y_train = []
    X_development = []
    Y_development = []
    X_test = []
    Y_predicted = []
    labels = []
    features = []

    def __init__(self, data, predict_method, show_fitting):
        """Copy the prepared splits and labels from the data object."""
        self.X_train = data.X_train
        self.Y_train = data.Y_train
        self.X_development = data.X_development
        self.Y_development = data.Y_development
        self.X_test = data.X_test
        self.labels = data.labels
        self.predict_method = predict_method
        self.show_fitting = show_fitting

    def classify(self, features, classifier=None):
        """Fit a FeatureUnion + SGD pipeline on the training data.

        classifier defaults to a hinge-loss SGDClassifier with a fixed
        random_state when not supplied.
        """
        feature_union = ('feats', FeatureUnion(features))
        if classifier is None:  # fixed: 'is None' instead of '== None'
            classifier = SGDClassifier(loss='hinge', random_state=42,
                                       max_iter=50, tol=None)
        self.classifier = Pipeline([feature_union,
                                    ('classifier', classifier)])
        # Removed leftover debug print of the full pipeline object.
        self.printer = Printer('Model Fitting', self.show_fitting)
        self.classifier.fit(self.X_train, self.Y_train)
        self.printer.duration()

    def evaluate(self):
        """Predict on development/test and compute metrics per predict_method."""
        if self.X_development:
            self.Y_development_predicted = self.classifier.predict(
                self.X_development)
            # Removed leftover debug print of the raw development data.
        if self.X_test:
            self.Y_test_predicted = self.classifier.predict(self.X_test)
        if self.predict_method == 'classification':
            self.accuracy, self.precision, self.recall, self.f1score = \
                classificationMetrics(self.Y_development,
                                      self.Y_development_predicted,
                                      self.labels)
        elif self.predict_method == 'regression':
            self.mean_abs_err, self.mean_squ_err, self.r2score, \
                self.kl_divergence = regressionMetrics(
                    self.Y_development, self.Y_development_predicted,
                    self.labels)

    def printBasicEvaluation(self):
        """Print the evaluation matching the configured predict_method."""
        if self.predict_method == 'classification':
            self.printer.evaluation(self.accuracy, self.precision,
                                    self.recall, self.f1score,
                                    "Classification Evaluation")
        elif self.predict_method == 'regression':
            self.printer.regressionEvaluation(self.mean_abs_err,
                                              self.mean_squ_err,
                                              self.r2score,
                                              self.kl_divergence,
                                              "Regression Evaluation")

    def printClassEvaluation(self):
        """Print per-class metrics for the development predictions."""
        self.printer.classEvaluation(self.Y_development,
                                     self.Y_development_predicted,
                                     self.labels)
options.add(name='predict_languages', _type=str, _default='esdi', _help='specify which language you want to predict') #Step 6: Parse arguments options.parse() #Use random seed random.seed(options.args.random_seed) #Custom function to read language from input options.args.predict_languages = languages(options.args.predict_languages) #Print system printer = Printer('System') printer.system(options.args_dict) #Step 7: Create data with default arguments data = Data(options.args.avoid_skewness, options.args.data_folder, options.args.predict_label, options.args.data_method) #Step 8: Add all datasources and transform them to row(Y, X) format #Custom, should be self-made! #Step 8.1: Add the files or folders the data is preserved in (only if available) if options.args.predict_languages: data.file_train = options.args.data_folder + 'training/' # data.file_development = 'eng-trial.pickle' # data.file_test = 'eng-test.pickle'
class NeuralNetwork:
    """LSTM text classifier over pretrained GloVe Twitter embeddings."""

    # Path and dimensionality of the pretrained embedding vectors.
    word_embeddings_file = 'data/word_embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt'
    word_embeddings_dim = 200
    word_embeddings_layer = None
    word_embeddings_index = {}
    # NOTE(review): these class-level dict/list defaults are shared between
    # instances and mutated in __init__ — confirm only one instance is ever
    # created per run.
    labels = []
    labels_dict = {}
    labels_dict_rev = {}
    Y = []

    def __init__(self, data, show_fitting):
        """Index the labels and encode Y as integer class ids."""
        self.data = data
        self.X = self.data.X
        self.labels = self.data.labels
        # Map each label name to an integer index (and back).
        for i, label in enumerate(self.labels):
            self.labels_dict[label] = i
            self.labels_dict_rev[i] = label
        self.Y = []
        for label in self.data.Y:
            self.Y.append(self.labels_dict[label])
        self.show_fitting = show_fitting

    def tokenize(self):
        """Tokenize the tweets, build padded integer sequences and one-hot Y."""
        self.X_tokenized = TextTokenizer.tokenizeTweets(self.X)  #all tweets!
        self.tokenizer = Tokenizer(split="|", )
        self.tokenizer.fit_on_texts(self.X_tokenized)
        self.sequences = self.tokenizer.texts_to_sequences(self.X_tokenized)
        self.X = pad_sequences(self.sequences)
        self.Y = to_categorical(self.Y)

    def classify(self, features, classifier=None):
        """Split the data, build the embedding + LSTM model and train it.

        'features' and 'classifier' are unused; kept for interface
        compatibility with the sklearn-based model classes.
        """
        self.tokenize()
        # Data layout: [train | development | test].
        train_development_split = self.data.amount_train
        development_test_split = self.data.amount_train + self.data.amount_development
        self.X_train = self.X[:train_development_split]
        self.Y_train = self.Y[:train_development_split]
        self.X_development = self.X[
            train_development_split:development_test_split]
        self.Y_development = self.Y[
            train_development_split:development_test_split]
        self.X_test = self.X[development_test_split:]
        if self.data.avoid_skewness:
            # Undo one-hot encoding to rebalance by label name.
            Y_train = np.argmax(self.Y_train, axis=1)
            Y_train = [self.labels_dict_rev[int(i)] for i in list(Y_train)]
            self.X_train, self.Y_train = unskewedTrain(self.X_train,
                                                       self.Y_train, Y_train)
            self.X_train = np.array(self.X_train)
            self.Y_train = np.array(self.Y_train)
        # Load cached embeddings; build from the GloVe file on a miss.
        self.word_embeddings_layer, self.word_embeddings_index = readWordEmbeddings(
            self.data.languages, self.data.response_variable)
        if self.word_embeddings_layer == None:
            self.createWordEmbeddings()
        self.printDataInformation()
        ##CHANGE OPTIONS HERE
        self.model = Sequential()
        self.model.add(self.word_embeddings_layer)
        self.model.add(Dropout(0.2))
        self.model.add(LSTM(self.word_embeddings_dim))
        self.model.add(Dense(self.Y.shape[1], activation='sigmoid'))
        self.model.compile(loss='categorical_crossentropy',
                           optimizer='adam',
                           metrics=['accuracy'])
        # Train the model
        self.printer = Printer('Model Fitting', self.show_fitting)
        self.model.fit(self.X_train,
                       self.Y_train,
                       epochs=5,
                       batch_size=128,
                       validation_split=0.2)
        self.printer.duration()

    def evaluate(self):
        """Predict on the development split and compute overall metrics."""
        self.Y_development_predicted = self.model.predict(self.X_development)
        # Convert probability rows back to label names for metric computation.
        self.Y_development_predicted = np.argmax(self.Y_development_predicted,
                                                 axis=1)
        self.Y_development_predicted = [
            self.labels_dict_rev[int(i)]
            for i in list(self.Y_development_predicted)
        ]
        self.Y_development = np.argmax(self.Y_development, axis=1)
        self.Y_development = [
            self.labels_dict_rev[int(i)] for i in list(self.Y_development)
        ]
        self.accuracy, self.precision, self.recall, self.f1score = metrics(
            self.Y_development, self.Y_development_predicted, self.labels)

    def printBasicEvaluation(self):
        """Print overall accuracy/precision/recall/F1."""
        self.printer.evaluation(self.accuracy, self.precision, self.recall,
                                self.f1score, "Basic Evaluation")

    def printClassEvaluation(self):
        """Print per-class metrics for the development predictions."""
        self.printer.classEvaluation(self.Y_development,
                                     self.Y_development_predicted,
                                     self.labels)

    def printDataInformation(self):
        """Print token counts and tensor shapes for the prepared data."""
        print('\n~~~Neural Network Distribution~~~\n')
        print('Found {} unique tokens.'.format(len(self.tokenizer.word_index)))
        print('Shape of data tensor: {}'.format(self.X.shape))
        print('Shape of label tensor: {}\n'.format(self.Y.shape))
        if len(self.word_embeddings_index) > 0:
            print('Found {} word vectors.'.format(
                len(self.word_embeddings_index)))

    def createWordEmbeddings(self):
        """Read GloVe vectors from disk, build the Embedding layer and cache it."""
        self.word_embeddings_index = {}
        f = open(self.word_embeddings_file, encoding="utf8")
        for line in f:
            # Each line: word followed by its embedding coefficients.
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            self.word_embeddings_index[word] = coefs
        f.close()
        # Row i of the matrix is the vector for tokenizer index i (index 0
        # is reserved, hence the +1).
        self.word_embeddings_matrix = np.zeros(
            (len(self.tokenizer.word_index) + 1, self.word_embeddings_dim))
        for word, i in self.tokenizer.word_index.items():
            embedding_vector = self.word_embeddings_index.get(word)
            if embedding_vector is not None:
                # words not found in embedding index will be all-zeros.
                self.word_embeddings_matrix[i] = embedding_vector
        self.word_embeddings_layer = Embedding(
            len(self.tokenizer.word_index) + 1,
            self.word_embeddings_dim,
            mask_zero=True,
            weights=[self.word_embeddings_matrix],
            trainable=True)
        writeWordEmbeddings(self.word_embeddings_layer,
                            self.word_embeddings_index, self.data.languages,
                            self.data.response_variable)
# System setup for the pickle-based English/Spanish data layout.
con = Constants()

#Step 4: Get options and read all system arguments
options = Options('System parameters', con)

#Step 5: Read all custom arguments/options
options.add(name='predict_languages',
            _type=str,
            _default='english',
            _help='specify which language you want to predict')

#Step 6: Parse arguments
options.parse()

#Use random seed
random.seed(options.args.random_seed)

#Print system
printer = Printer('System')
printer.system(options.args_dict)

#Step 7: Create data with default arguments
# Fixed: the Data(...) call was missing its closing parenthesis.
data = Data(options.args.avoid_skewness, options.args.data_folder,
            options.args.predict_label, options.args.data_method)

#Step 8: Add all datasources and transform them to row(Y, X) format
#Custom, should be self-made!
#Step 8.1: Add the files or folders the data is preserved in (only if available)
if options.args.predict_languages == 'english':
    data.file_train = 'eng-train.pickle'
    data.file_development = 'eng-trial.pickle'
    data.file_test = 'eng-test.pickle'
else:
    data.file_train = 'es-train.pickle'
def classify(self, features, classifier=None):
    """Fit the pre-built classifier on the training split.

    'features' and 'classifier' are accepted only for interface
    compatibility with the other model classes; both are ignored.
    """
    # Time the fit via the Printer start/duration pair.
    self.printer = Printer('Model Fitting', self.show_fitting)
    self.classifier.fit(self.X_train, self.Y_train)
    self.printer.duration()
class NeuralNetwork:
    """Dense feed-forward classifier over pre-extracted 'xy' feature sequences."""

    # Path and dimensionality of the pretrained embedding vectors (used only
    # by createWordEmbeddings in this variant).
    word_embeddings_file = 'data/word_embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt'
    word_embeddings_dim = 200
    word_embeddings_layer = None
    word_embeddings_index = {}
    # NOTE(review): these class-level dict/list defaults are shared between
    # instances and mutated in __init__ — confirm only one instance is ever
    # created per run.
    labels = []
    labels_dict = {}
    labels_dict_rev = {}
    Y = []

    def __init__(self, data, predict_method, show_fitting):
        """Index the labels and encode Y as integer class ids."""
        self.data = data
        self.X = self.data.X
        self.labels = self.data.labels
        # Map each label name to an integer index (and back).
        for i, label in enumerate(self.labels):
            self.labels_dict[label] = i
            self.labels_dict_rev[i] = label
        self.Y = []
        for label in self.data.Y:
            self.Y.append(self.labels_dict[label])
        self.predict_method = predict_method
        self.show_fitting = show_fitting

    def tokenize(self):
        """Collect the per-sample 'xy' feature sequences, pad X, one-hot Y.

        Each sample x is a dict; only x['xy'] feeds the model — the
        section/area/element lists are collected but currently unused.
        """
        xy_section = []
        xy_area = []
        xy_element = []
        xy = []
        for x in self.X:
            xy_section.append(x['xy_section'])
            #xy_area.append(text_to_word_sequence('|'.join(x['xy_area']), split='|'))
            xy_area.append(x['xy_area'])
            xy_element.append(x['xy_element'])
            xy.append(x['xy'])
        #self.X_tokenized = TextTokenizer.tokenizeTweets(self.X)  #all tweets!
        #print(xy_area)
        # self.X_tokenized = xy_area
        # vectorizer = TfidfVectorizer(tokenizer=TextTokenizer.tokenized, lowercase=False, analyzer='word', ngram_range=(1, 1), min_df=1)
        # self.X = vectorizer.fit_transform(self.X_tokenized)
        # self.input_length = len(vectorizer.get_feature_names())
        #print(self.X)
        self.X = sequence.pad_sequences(xy)
        self.feature_length = len(self.X[0])
        self.feature_dimensions = len(self.X[0][0])
        print(self.feature_dimensions)
        self.input_length = len(self.X)
        self.Y = to_categorical(self.Y)

    def classify(self, features, classifier=None):
        """Split the data, build a Dense feed-forward model and train it.

        'features' and 'classifier' are unused; kept for interface
        compatibility with the sklearn-based model classes.
        """
        self.tokenize()
        # Data layout: [train | development | test].
        train_development_split = self.data.amount_train
        development_test_split = self.data.amount_train + self.data.amount_development
        self.X_train = self.X[:train_development_split]
        self.Y_train = self.Y[:train_development_split]
        self.X_development = self.X[
            train_development_split:development_test_split]
        self.Y_development = self.Y[
            train_development_split:development_test_split]
        self.X_test = self.X[development_test_split:]
        if self.data.avoid_skewness:
            # Undo one-hot encoding to rebalance by label name.
            Y_train = np.argmax(self.Y_train, axis=1)
            Y_train = [self.labels_dict_rev[int(i)] for i in list(Y_train)]
            self.X_train, self.Y_train = unskewedTrain(self.X_train,
                                                       self.Y_train, Y_train)
            self.X_train = np.array(self.X_train)
            self.Y_train = np.array(self.Y_train)
        ##CHANGE OPTIONS HERE
        # NOTE(review): output layer is Dense(6) — presumably 6 classes;
        # confirm it matches len(self.labels).
        self.model = Sequential()
        self.model.add(
            Dense(512,
                  input_shape=(self.feature_length,
                               self.feature_dimensions)))
        self.model.add(Flatten())
        self.model.add(Dense(6))
        self.model.add(Activation('relu'))
        # self.model.add(Dropout(0.2))
        #
        # self.model.add(Dense(128))
        # self.model.add(Activation('relu'))
        # self.model.add(Dropout(0.1))
        # self.model.add(Dense(6, input_dim=self.feature_length,)))
        # self.model.add(Activation('sigmoid'))
        self.model.compile(loss='categorical_crossentropy',
                           optimizer='adam',
                           metrics=['accuracy'])
        # Train the model
        self.printer = Printer('Model Fitting', self.show_fitting)
        self.model.fit(self.X_train,
                       self.Y_train,
                       epochs=50,
                       batch_size=128,
                       validation_split=0.2)
        self.printer.duration()

    def evaluate(self):
        """Predict on the development split and compute overall metrics."""
        self.Y_development_predicted = self.model.predict(self.X_development)
        # Convert probability rows back to label names for metric computation.
        self.Y_development_predicted = np.argmax(self.Y_development_predicted,
                                                 axis=1)
        self.Y_development_predicted = [
            self.labels_dict_rev[int(i)]
            for i in list(self.Y_development_predicted)
        ]
        self.Y_development = np.argmax(self.Y_development, axis=1)
        self.Y_development = [
            self.labels_dict_rev[int(i)] for i in list(self.Y_development)
        ]
        self.accuracy, self.precision, self.recall, self.f1score = \
            classificationMetrics(self.Y_development,
                                  self.Y_development_predicted, self.labels)

    def printBasicEvaluation(self):
        """Print overall accuracy/precision/recall/F1."""
        self.printer.evaluation(self.accuracy, self.precision, self.recall,
                                self.f1score, "Basic Evaluation")

    def printClassEvaluation(self):
        """Print per-class metrics for the development predictions."""
        self.printer.classEvaluation(self.Y_development,
                                     self.Y_development_predicted,
                                     self.labels)

    def printDataInformation(self):
        """Print token counts and tensor shapes for the prepared data.

        NOTE(review): this variant's tokenize() never sets self.tokenizer,
        so this method (and createWordEmbeddings) would raise if called —
        confirm they are unused here.
        """
        print('\n~~~Neural Network Distribution~~~\n')
        print('Found {} unique tokens.'.format(len(self.tokenizer.word_index)))
        print('Shape of data tensor: {}'.format(self.X.shape))
        print('Shape of label tensor: {}\n'.format(self.Y.shape))
        if len(self.word_embeddings_index) > 0:
            print('Found {} word vectors.'.format(
                len(self.word_embeddings_index)))

    def createWordEmbeddings(self):
        """Read GloVe vectors from disk, build the Embedding layer and cache it."""
        self.word_embeddings_index = {}
        f = open(self.word_embeddings_file, encoding="utf8")
        for line in f:
            # Each line: word followed by its embedding coefficients.
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            self.word_embeddings_index[word] = coefs
        f.close()
        # Row i of the matrix is the vector for tokenizer index i (index 0
        # is reserved, hence the +1).
        self.word_embeddings_matrix = np.zeros(
            (len(self.tokenizer.word_index) + 1, self.word_embeddings_dim))
        for word, i in self.tokenizer.word_index.items():
            embedding_vector = self.word_embeddings_index.get(word)
            if embedding_vector is not None:
                # words not found in embedding index will be all-zeros.
                self.word_embeddings_matrix[i] = embedding_vector
        self.word_embeddings_layer = Embedding(
            len(self.tokenizer.word_index) + 1,
            self.word_embeddings_dim,
            mask_zero=True,
            weights=[self.word_embeddings_matrix],
            trainable=True)
        writeWordEmbeddings(self.word_embeddings_layer,
                            self.word_embeddings_index, self.data.languages,
                            self.data.response_variable)
class KFoldValidation:
    """Run k-fold cross-validation over the train+development portion of the data."""

    # Kept for backward compatibility with external lookups; the real
    # accumulators are created per instance in __init__ (class-level lists
    # are shared across instances and would accumulate results between runs).
    accuracy = []
    precision = []
    recall = []
    f1score = []

    def __init__(self, k, method, data, features, predict_method,
                 new_classifier, print_details, show_fitting):
        """Store the cross-validation configuration and run it immediately."""
        # Fixed: per-fold metric lists are now instance-level so that two
        # KFoldValidation objects no longer share (and pollute) results.
        self.accuracy = []
        self.precision = []
        self.recall = []
        self.f1score = []
        self.k = k
        self.kf = KFold(n_splits=self.k)
        self.print_details = print_details
        self.show_fitting = show_fitting
        self.printer = Printer(str(self.k) + '-Fold validation')
        self.method = method
        self.data = data
        self.features = features
        self.predict_method = predict_method
        self.new_classifier = new_classifier
        self.validation()

    def validation(self):
        """Train and evaluate one classifier per fold, collecting metrics."""
        i = 0  # 1-based fold counter, used only for progress printing
        for train_index, test_index in self.kf.split(
                self.data.X[:(self.data.amount_train +
                              self.data.amount_development)]):
            i += 1
            # n_printer exists only when print_details >= 4; the later uses
            # at levels 7 and 4 are therefore safe.
            if self.print_details >= 4:
                n_printer = Printer(str(self.k) + '-Fold, Run: ' + str(i))
            X_train, X_development = list(np.array(
                self.data.X)[train_index]), list(
                    np.array(self.data.X)[test_index])
            Y_train, Y_development = list(np.array(
                self.data.Y)[train_index]), list(
                    np.array(self.data.Y)[test_index])
            # Re-initialize the shared data object with this fold's split.
            self.data.initialize(X_train, Y_train, X_development,
                                 Y_development)
            classifier = selectClassifier(self.method, self.data,
                                          self.predict_method,
                                          self.show_fitting)
            classifier.classify(self.features, self.new_classifier)
            classifier.evaluate()
            self.accuracy.append(classifier.accuracy)
            self.precision.append(classifier.precision)
            self.recall.append(classifier.recall)
            self.f1score.append(classifier.f1score)
            if self.print_details >= 5:
                classifier.printBasicEvaluation()
            if self.print_details >= 6:
                classifier.printClassEvaluation()
            if self.print_details >= 7:
                n_printer.confusionMatrix(classifier.Y_development,
                                          classifier.Y_development_predicted,
                                          self.data.labels)
            # writeResults(options.args.predict_languages, classifier.Y_development, classifier.Y_development_predicted, 'development')
            if self.print_details >= 4:
                n_printer.duration()

    def printBasicEvaluation(self):
        """Print the metric averages over all folds."""
        self.printer.evaluation(
            avg(self.accuracy), avg(self.precision), avg(self.recall),
            avg(self.f1score),
            str(self.k) + "-Fold Cross Validation Evaluation")
        self.printer.duration()
class SVM:
    """Linear SVM (hinge-loss SGDClassifier) over a FeatureUnion pipeline.

    Exposes the shared classify/evaluate/print interface used by the other
    model classes in this project.
    """

    # Class-level defaults; shadowed by the instance attributes assigned in
    # __init__ and kept only for backward compatibility.
    X_train = []
    Y_train = []
    X_development = []
    Y_development = []
    X_test = []
    Y_predicted = []
    labels = []
    features = []

    def __init__(self, data, show_fitting):
        """Copy the prepared splits and labels from the data object."""
        self.X_train = data.X_train
        self.Y_train = data.Y_train
        self.X_development = data.X_development
        self.Y_development = data.Y_development
        self.X_test = data.X_test
        self.labels = data.labels
        self.show_fitting = show_fitting

    def classify(self, features, classifier=None):
        """Fit a FeatureUnion + SGD pipeline on the training data.

        classifier defaults to a hinge-loss SGDClassifier with a fixed
        random_state when not supplied.
        """
        feature_union = ('feats', FeatureUnion(features))
        if classifier is None:  # fixed: 'is None' instead of '== None'
            classifier = SGDClassifier(loss='hinge', random_state=42,
                                       max_iter=50, tol=None)
        self.classifier = Pipeline([feature_union,
                                    ('classifier', classifier)])
        self.printer = Printer('Model Fitting', self.show_fitting)
        # Removed commented-out train_test_split experiment code.
        self.classifier.fit(self.X_train, self.Y_train)
        self.printer.duration()

    def evaluate(self):
        """Predict on the development (and test) split and compute metrics."""
        if self.X_development:
            self.Y_development_predicted = self.classifier.predict(
                self.X_development)
        if self.X_test:
            self.Y_test_predicted = self.classifier.predict(self.X_test)
        self.accuracy, self.precision, self.recall, self.f1score = metrics(
            self.Y_development, self.Y_development_predicted, self.labels)

    def printBasicEvaluation(self):
        """Print overall accuracy/precision/recall/F1."""
        self.printer.evaluation(self.accuracy, self.precision, self.recall,
                                self.f1score, "Basic Evaluation")

    def printClassEvaluation(self):
        """Print per-class metrics for the development predictions."""
        self.printer.classEvaluation(self.Y_development,
                                     self.Y_development_predicted,
                                     self.labels)