def LinearSVC_classification(train, test, train_labels, test_labels, res={}):
    """
    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :param res: optional dict that collects the result of every classifier
    :return: accuracy score and fitted model; also saves the report in the "Results" folder
    """
    print("Classifying with LinearSVC...")
    linear_svc = LinearSVC()
    linear_svc.fit(train, train_labels)
    prediction = linear_svc.predict(test)
    utils.report_and_confmat(test_labels, prediction, "LinearSVC")
    score = linear_svc.score(test, test_labels)
    res["LinearSVC"] = {"model": linear_svc, "accuracy": score, "name": "LinearSVC"}
    print("LinearSVC ended...")
    return score, linear_svc
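# Hypothetical usage sketch (not part of the original module): shows how the
# wrapper above is meant to be driven. TfidfVectorizer and the 80/20 split are
# assumptions; any vectorised text representation with matching label lists works.
def _example_run_linear_svc(corpus, labels):
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.model_selection import train_test_split
    features = TfidfVectorizer().fit_transform(corpus)
    X_train, X_test, y_train, y_test = train_test_split(features, labels,
                                                        test_size=0.2, random_state=0)
    results = {}  # shared dict that every *_classification call fills in
    score, model = LinearSVC_classification(X_train, X_test, y_train, y_test, results)
    return score, model, results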
def MultinomialNB_classification(train, test, train_labels, test_labels, res={}):
    """
    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :param res: optional dict that collects the result of every classifier
    :return: accuracy score and fitted model; also saves the report in the "Results" folder
    """
    print("Classifying with MultinomialNB...")
    multiNB = MultinomialNB()
    multiNB.fit(train, train_labels)
    prediction = multiNB.predict(test)
    utils.report_and_confmat(test_labels, prediction, "MultinomialNB")
    score = multiNB.score(test, test_labels)
    res["MultinomialNB"] = {"model": multiNB, "accuracy": score, "name": "MultinomialNB"}
    print("MultinomialNB ended...")
    return score, multiNB
def ExtraTrees_classification(train, test, train_labels, test_labels, res={}):
    """
    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :param res: optional dict that collects the result of every classifier
    :return: accuracy score and fitted model; also saves the report in the "Results" folder
    """
    print("Classifying with ExtraTrees...")
    extra = ExtraTreesClassifier()
    extra.fit(train, train_labels)
    prediction = extra.predict(test)
    utils.report_and_confmat(test_labels, prediction, "ExtraTrees")
    score = extra.score(test, test_labels)
    res["ExtraTrees"] = {"model": extra, "accuracy": score, "name": "ExtraTreesClassifier"}
    print("ExtraTrees ended...")
    return score, extra
def AdaBoost_classification(train, test, train_labels, test_labels, res={}):
    """
    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :param res: optional dict that collects the result of every classifier
    :return: accuracy score and fitted model; also saves the report in the "Results" folder
    """
    print("Classifying with AdaBoost...")
    # LinearSVC is used as the base estimator because it has worked best so far.
    # algorithm='SAMME' is required here: LinearSVC has no predict_proba, which
    # the default SAMME.R variant needs.
    Linsvc = LinearSVC()
    adab = AdaBoostClassifier(base_estimator=Linsvc, algorithm='SAMME', n_estimators=50)
    adab.fit(train, train_labels)
    prediction = adab.predict(test)
    utils.report_and_confmat(test_labels, prediction, "AdaBoost")
    score = adab.score(test, test_labels)
    print("AdaBoost ended...")
    res["AdaBoostClassifier"] = {"model": adab, "accuracy": score, "name": "AdaBoostClassifier"}
    return score, adab
def LogisticRegression_classification(train, test, train_labels, test_labels, res={}):
    """
    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :param res: optional dict that collects the result of every classifier
    :return: accuracy score and fitted model; also saves the report in the "Results" folder
    """
    print("Classifying with LogisticRegression...")
    # TODO: try the other solvers as well
    reg = LogisticRegression(max_iter=250, multi_class='multinomial', solver='newton-cg')
    reg.fit(train, train_labels)
    prediction = reg.predict(test)
    utils.report_and_confmat(test_labels, prediction, "LogisticReg")
    score = reg.score(test, test_labels)
    res["LogisticRegression"] = {"model": reg, "accuracy": score, "name": "LogisticRegression"}
    print("LogisticRegression ended...")
    return score, reg
def random_forest_classification(train, test, train_labels, test_labels, res={}):
    """
    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :param res: optional dict that collects the result of every classifier
    :return: accuracy score and fitted model; also saves the report in the "Results" folder
    """
    print("Classifying with Random Forest Classifier...")
    rand = RandomForestClassifier(n_estimators=70, max_depth=None)
    rand.fit(train, train_labels)
    prediction = rand.predict(test)
    utils.report_and_confmat(test_labels, prediction, "Random Forest")
    score = rand.score(test, test_labels)
    res["RandomForestClassifier"] = {"model": rand, "accuracy": score, "name": "RandomForestClassifier"}
    print("RandomForest ended...")
    return score, rand
def GradientBoosting_classification(train, test, train_labels, test_labels, res={}):
    """
    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :param res: optional dict that collects the result of every classifier
    :return: accuracy score and fitted model; also saves the report in the "Results" folder
    """
    print("Classifying with Gradient Boosting...")
    gradb = GradientBoostingClassifier(n_estimators=100)
    gradb.fit(train, train_labels)
    prediction = gradb.predict(test)
    utils.report_and_confmat(test_labels, prediction, "GradientBoosting")
    score = gradb.score(test, test_labels)
    res["GradientBoostingClassifier"] = {"model": gradb, "accuracy": score, "name": "GradientBoostingClassifier"}
    print("GradientBoosting ended...")
    return score, gradb
def BernoulliNB_classification(train, test, train_labels, test_labels, res={}):
    """
    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :param res: optional dict that collects the result of every classifier
    :return: accuracy score and fitted model; also saves the report in the "Results" folder
    """
    print("Classifying with Bernoulli Naive Bayes...")
    bernNB = BernoulliNB(alpha=0.7)
    bernNB.fit(train, train_labels)
    prediction = bernNB.predict(test)
    utils.report_and_confmat(test_labels, prediction, "BernoulliNB")
    score = bernNB.score(test, test_labels)
    res["BernoulliNB"] = {"model": bernNB, "accuracy": score, "name": "BernoulliNB"}
    print("Bernoulli ended...")
    return score, bernNB
def ComplementNB_classification(train, test, train_labels, test_labels, res={}):
    """
    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :param res: optional dict that collects the result of every classifier
    :return: accuracy score and fitted model; also saves the report in the "Results" folder
    """
    print("Classifying with Complement Naive Bayes...")
    complNB = ComplementNB()
    complNB.fit(train, train_labels)
    prediction = complNB.predict(test)
    utils.report_and_confmat(test_labels, prediction, "ComplementNB")
    score = complNB.score(test, test_labels)
    res["ComplementNB"] = {"model": complNB, "accuracy": score, "name": "ComplementNB"}
    print("Complement ended...")
    return score, complNB
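# Hypothetical helper (not in the original module): all the sklearn wrappers
# above write their outcome into a shared `res` dict, so after running several
# of them the best entry can be picked by accuracy as sketched here.
def _best_result(res):
    # res maps name -> {"model": ..., "accuracy": ..., "name": ...}
    best = max(res.values(), key=lambda entry: entry["accuracy"])
    return best["name"], best["accuracy"], best["model"]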
def lstm_classification(train, valid, labels_train, labels_valid, save_path, num_classes,
                        num_epochs=10, batch_size=64):
    """
    Trains and evaluates an LSTM on top of pretrained word embeddings.
    :param train: training data, iterable/list
    :param valid: validation/testing data, iterable/list
    :param labels_train: training labels, iterable/list
    :param labels_valid: validation labels, iterable/list
    :param save_path: directory where the report is saved
    :param num_classes: number of classes in the training data, integer
    :param num_epochs: number of training epochs, integer
    :param batch_size: training batch size, integer (exposed as a parameter; the
        original read an undefined outer-scope name, and the default of 64 is an assumption)
    :return: / --> Saves data in save_path
    """
    train_lab = labels_for_NN(labels_train)
    EMBEDDING_DIM = 300
    MAX_SEQUENCE_LENGTH = 750
    embedding_matrix, vocab, train_we, test_we = create_embedding(train, valid)
    VOCAB_SIZE = len(vocab)

    model = Sequential()
    model.add(Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH,
                        weights=[embedding_matrix]))
    model.add(LSTM(512))
    model.add(Dense(100, activation='sigmoid'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Fit on the training data
    history = model.fit(train_we, np.array(train_lab), validation_split=0.2,
                        epochs=num_epochs, batch_size=batch_size)
    utils.plot_history(history)

    # If the TF-IDF matrix or the bag-of-words features turn out to be unsuitable, use this instead:
    # tokenizer = Tokenizer(num_words=VOCAB_SIZE)
    # sequences = tokenizer.texts_to_sequences(valid)
    # data_test = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    # list_prediction_proba = model.predict(data_test)

    list_prediction_proba = model.predict(test_we)
    # Index of the highest-probability class (equivalent to argmax)
    predizione = [np.where(probabilities == probabilities.max())[0].min()
                  for probabilities in list_prediction_proba]
    utils.report_and_confmat(labels_train, labels_valid, predizione, save_path,
                             "TINY_lstm_" + str(EMBEDDING_DIM))
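# labels_for_NN is defined elsewhere. Since the networks in this file train
# with categorical_crossentropy against its output, it presumably one-hot
# encodes the label list; this is a minimal sketch under that assumption.
def _labels_for_NN_sketch(labels):
    classes = sorted(set(labels))                  # stable class ordering
    index = {c: i for i, c in enumerate(classes)}
    one_hot = np.zeros((len(labels), len(classes)))
    for row, lab in enumerate(labels):
        one_hot[row, index[lab]] = 1.0
    return one_hot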
def conv_classification(train, valid, labels_train, labels_valid, save_path, num_classes,
                        num_epochs=10, batch_size=64):
    """
    Trains and evaluates a single-layer CNN on top of pretrained word embeddings.
    :param train: training data, iterable/list
    :param valid: validation/testing data, iterable/list
    :param labels_train: training labels, iterable/list
    :param labels_valid: validation labels, iterable/list
    :param save_path: directory where the report is saved
    :param num_classes: number of classes in the training data, integer
    :param num_epochs: number of training epochs, integer
    :param batch_size: training batch size, integer (exposed as a parameter; the
        original read an undefined outer-scope name, and the default of 64 is an assumption)
    :return: / --> Saves data in save_path
    """
    train_lab = labels_for_NN(labels_train)
    EMBEDDING_DIM = 300
    MAX_SEQUENCE_LENGTH = 750
    embedding_matrix, vocab, train_we, test_we = create_embedding(train, valid)
    VOCAB_SIZE = len(vocab)

    model = Sequential()
    model.add(Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH,
                        weights=[embedding_matrix]))
    model.add(Dropout(0.2))
    model.add(Conv1D(512, 7, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(100, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    # NB: binary classification --> binary_crossentropy,
    #     multi-class classification --> categorical_crossentropy
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    history = model.fit(train_we, np.array(train_lab), validation_split=0.2,
                        epochs=num_epochs, batch_size=batch_size)
    utils.plot_history(history)

    list_prediction_proba = model.predict(test_we)
    # Index of the highest-probability class (equivalent to argmax)
    predizione = [np.where(probabilities == probabilities.max())[0].min()
                  for probabilities in list_prediction_proba]
    utils.report_and_confmat(labels_train, labels_valid, predizione, save_path,
                             "TINY_conv_1_layer" + str(EMBEDDING_DIM))
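# utils.plot_history is defined elsewhere. Given that it receives a Keras
# History object, it presumably plots the training curves; a minimal
# matplotlib sketch under that assumption (the metric key is 'acc' or
# 'accuracy' depending on the Keras version):
def _plot_history_sketch(history):
    import matplotlib.pyplot as plt
    hist = history.history
    acc_key = "acc" if "acc" in hist else "accuracy"
    plt.plot(hist[acc_key], label="train")
    plt.plot(hist["val_" + acc_key], label="validation")
    plt.xlabel("epoch")
    plt.ylabel("accuracy")
    plt.legend()
    plt.show()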
def bi_lstm_classification(train, valid, labels_train, labels_valid, save_path, num_classes,
                           num_epochs=10, batch_size=64):
    """
    Trains and evaluates a bidirectional LSTM on top of pretrained word embeddings.
    :param train: training data, iterable/list
    :param valid: validation/testing data, iterable/list
    :param labels_train: training labels, iterable/list
    :param labels_valid: validation labels, iterable/list
    :param save_path: directory where the report is saved
    :param num_classes: number of classes in the training data, integer
    :param num_epochs: number of training epochs, integer
    :param batch_size: training batch size, integer (exposed as a parameter; the
        original read an undefined outer-scope name, and the default of 64 is an assumption)
    :return: / --> Saves data in save_path
    """
    train_lab = labels_for_NN(labels_train)
    EMBEDDING_DIM = 300
    MAX_SEQUENCE_LENGTH = 750
    embedding_matrix, vocab, train_we, test_we = create_embedding(train, valid)
    VOCAB_SIZE = len(vocab)

    model = Sequential()
    model.add(Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH,
                        weights=[embedding_matrix]))
    model.add(Bidirectional(LSTM(512, return_sequences=False)))
    model.add(Dropout(0.2))
    model.add(Dense(100, activation='sigmoid'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Fit on the training data
    history = model.fit(train_we, np.array(train_lab), validation_split=0.2,
                        epochs=num_epochs, batch_size=batch_size)
    utils.plot_history(history)

    list_prediction_proba = model.predict(test_we)
    # Index of the highest-probability class (equivalent to argmax)
    predizione = [np.where(probabilities == probabilities.max())[0].min()
                  for probabilities in list_prediction_proba]
    utils.report_and_confmat(labels_train, labels_valid, predizione, save_path,
                             "TINY_bilstm" + str(EMBEDDING_DIM))
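# create_embedding is defined elsewhere. Judging from how its return values
# are used by the three networks above, and from the inline word2vec logic in
# word2vec_classification below, it presumably trains a word2vec model on the
# training split, converts both splits to padded index sequences, and builds
# the embedding lookup matrix. A minimal sketch under those assumptions
# (gensim < 4 API, mirroring the Word2Vec call below):
def _create_embedding_sketch(train, valid, embedding_dim=300, max_len=750):
    train_tokens = [sentence.split() for sentence in train]
    test_tokens = [sentence.split() for sentence in valid]
    w2v = word2vec.Word2Vec(train_tokens, iter=10, min_count=10,
                            size=embedding_dim, workers=4)
    vocab = w2v.wv.vocab
    train_we = pad_sequences(utils.convert_data_to_index(train_tokens, w2v.wv),
                             maxlen=max_len, padding="pre", truncating="post")
    test_we = pad_sequences(utils.convert_data_to_index(test_tokens, w2v.wv),
                            maxlen=max_len, padding="pre", truncating="post")
    embedding_matrix = np.zeros((len(vocab), embedding_dim))
    for i in range(len(vocab)):
        embedding_matrix[i] = w2v.wv[w2v.wv.index2word[i]]
    return embedding_matrix, vocab, train_we, test_we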
def VotingClassifier_classification(train, test, train_labels, test_labels, res={}):
    """
    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :param res: optional dict that collects the result of every classifier
    :return: accuracy score and fitted model; also saves the report in the "Results" folder
    """
    print("Classifying with Voting classifier...")
    cl1 = LogisticRegression(max_iter=250, multi_class='auto')
    cl6 = MultinomialNB()
    cl3 = AdaBoostClassifier(base_estimator=cl1, algorithm='SAMME', n_estimators=150)
    cl4 = GradientBoostingClassifier()
    cl5 = ComplementNB()
    cl8 = RandomForestClassifier(n_estimators=70, max_depth=None)
    cl9 = ExtraTreesClassifier()
    vote = VotingClassifier(estimators=[('LogisticReg', cl1), ('AdaBoost', cl3),
                                        ('GradBoost', cl4), ('ComplementNB', cl5),
                                        ('MultinomialNB', cl6), ('RandomForest', cl8),
                                        ('ExtraTree', cl9)],
                            voting='soft')
    vote.fit(train, train_labels)
    prediction = vote.predict(test)
    utils.report_and_confmat(test_labels, prediction, "VotingClass")
    score = vote.score(test, test_labels)
    print("Voting ended...")
    res["VotingClassifier"] = {"model": vote, "accuracy": score, "name": "VotingClassifier"}
    return score, vote
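# Note on voting='soft' above: soft voting averages the estimators' predicted
# class probabilities, so every estimator must implement predict_proba (all of
# the ones used above do). A quick hypothetical sanity check one might add:
def _check_soft_voting(estimators):
    return all(hasattr(est, "predict_proba") for _, est in estimators)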
def SVC_classification(train, test, train_labels, test_labels, res={}):
    """
    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :param res: optional dict that collects the result of every classifier
    :return: accuracy score and fitted model; also saves the report in the "Results" folder
    """
    print("Classifying with SVC...")
    svc = SVC(kernel='poly', gamma='scale')
    svc.fit(train, train_labels)
    prediction = svc.predict(test)
    utils.report_and_confmat(test_labels, prediction, "SVC")
    score = svc.score(test, test_labels)
    res["SVC"] = {"model": svc, "accuracy": score, "name": "SVC"}
    print("SVC ended...")
    return score, svc
def word2vec_classification(train, valid, labels_train, labels_valid, save_path, num_classes, num_epochs=10):
    """
    Trains word2vec embeddings on the training data and feeds them to a small
    convolutional + recurrent network, then saves the classification report.
    :param train: training data, iterable/list
    :param valid: testing data, iterable/list
    :param labels_train: training labels, iterable/list
    :param labels_valid: testing labels, iterable/list
    :param save_path: where the report is saved (fixed to the Models directory)
    :param num_classes: number of classes in training data, integer
    :param num_epochs: number of epochs to perform, integer
    :return: /
    """
    train_lab = utils.labels_for_NN(labels_train)

    train_tokens = [sentence.split() for sentence in train]
    test_tokens = [sentence.split() for sentence in valid]

    # Dimension of the embedding vector representing the words
    EMBEDDING_DIM = 300

    # Using gensim: Word2Vec takes the list of training tokens and builds the vocabulary
    w2v_model = word2vec.Word2Vec(train_tokens, iter=10, min_count=10, size=EMBEDDING_DIM, workers=4)
    VOCAB_SIZE = len(w2v_model.wv.vocab)
    MAX_SEQUENCE_LENGTH = 750

    # Convert the token lists to sequences of vocabulary indices
    train_sequences = utils.convert_data_to_index(train_tokens, w2v_model.wv)
    test_sequences = utils.convert_data_to_index(test_tokens, w2v_model.wv)

    # Pad the sequences so they are all the same length
    train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="pre", truncating="post")
    test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="pre", truncating="post")

    # Build the embedding matrix: a lookup table that translates a known word into a vector
    embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
    for i in range(len(w2v_model.wv.vocab)):
        embedding_matrix[i] = w2v_model.wv[w2v_model.wv.index2word[i]]

    # Build the network: a first convolutional part followed by a recurrent part (LSTM).
    # NB: the network is very small and basic because of strict system requirements.
    model = Sequential()
    model.add(Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH,
                        weights=[embedding_matrix]))
    model.add(Conv1D(512, 5, activation='sigmoid'))
    # Local pooling keeps the time dimension, which the LSTM below requires; the
    # original GlobalMaxPooling1D would collapse it and break the model.
    # (MaxPooling1D must be imported alongside the other keras.layers imports.)
    model.add(MaxPooling1D(5))
    model.add(Bidirectional(LSTM(600, return_sequences=False)))
    model.add(Dense(100, activation='sigmoid'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(lr=0.002, clipnorm=.25, beta_1=0.7, beta_2=0.99),
                  metrics=['acc'])

    # Train the network
    model.fit(train_data, train_lab, validation_split=0.2, epochs=num_epochs, batch_size=80)

    # Make predictions
    list_prediction_proba = model.predict(test_data)

    # Compute report and confusion matrix; each prediction is the index of the
    # highest-probability class (equivalent to argmax)
    predizione = [np.where(probabilities == probabilities.max())[0].min()
                  for probabilities in list_prediction_proba]
    utils.report_and_confmat(labels_train, labels_valid, predizione, save_path, "word2vec_")
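# utils.convert_data_to_index is defined elsewhere. It presumably maps each
# token to its index in the word2vec vocabulary, skipping out-of-vocabulary
# words; a minimal sketch under that assumption (gensim < 4 API):
def _convert_data_to_index_sketch(tokenized_docs, wv):
    return [[wv.vocab[token].index for token in tokens if token in wv.vocab]
            for tokens in tokenized_docs]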