import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras import regularizers
from Preprocessing import preprocess

def neural_network_classification(metrics):
    training_data, training_labels, test_data, test_labels, categories, mappings = preprocess(
        metrics)
    activation = "relu"
    model = Sequential()
    model.add(
        Dense(len(metrics) * 2,
              activation=activation,
              kernel_regularizer=regularizers.l2(0.1),
              input_shape=(len(metrics), )))
    model.add(
        Dense(30,
              activation=activation,
              kernel_regularizer=regularizers.l2(0.1)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss="binary_crossentropy")
    model.fit(training_data,
              training_labels,
              epochs=30,
              batch_size=300,
              validation_data=(test_data, test_labels),
              verbose=0)
    data = np.concatenate((training_data, test_data))
    labels = np.concatenate((training_labels, test_labels))
    predictions = model.predict(data)
    predictions = np.squeeze(predictions, axis=1)
    return data, predictions, labels, categories, mappings
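# Hedged usage sketch for the classifier above. The metric names are taken
# from the other snippets in this file, and the 0.5 cutoff for turning the
# sigmoid outputs into hard labels is an assumption, not something the
# function itself fixes.
data, predictions, labels, categories, mappings = neural_network_classification(
    ["race", "sex", "age", 'c_charge_degree', 'priors_count', 'c_charge_desc'])
hard_labels = (predictions >= 0.5).astype(int)
print("overall accuracy:", (hard_labels == labels).mean())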
import cv2

def infer(model, fnImg):
    "recognize text in image provided by file path"
    img = preprocess(cv2.imread(fnImg, cv2.IMREAD_GRAYSCALE), Model.imgSize)
    batch = BatchImages(None, [img])
    (recognized, probability) = model.inferBatch(batch, True)
    print('Recognized:', '"' + recognized[0] + '"')
    print('Probability:', probability[0])
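# Hedged usage sketch: Model's constructor arguments and the image path are
# placeholders, since neither is shown above.
#
#     model = Model(char_list)        # hypothetical construction
#     infer(model, 'data/word.png')   # prints recognized text + probability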
def neural_network_classification(metrics_):
    """This function takes as input a list of metrics and uses a neural
    network to perform the classification.

    :param metrics_: List containing the metrics that we want to use for
        classification."""
    training_data, training_labels, test_data, test_labels, categories_, mappings_ = preprocess(
        metrics_)
    activation = "relu"
    model = Sequential()
    model.add(
        Dense(len(metrics_) * 2,
              activation=activation,
              kernel_regularizer=regularizers.l2(0.1),
              input_shape=(len(metrics_), )))
    model.add(
        Dense(30,
              activation=activation,
              kernel_regularizer=regularizers.l2(0.1)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss="binary_crossentropy")
    model.fit(training_data,
              training_labels,
              epochs=30,
              batch_size=300,
              validation_data=(test_data, test_labels),
              verbose=1)
    data_ = np.concatenate((training_data, test_data))
    labels_ = np.concatenate((training_labels, test_labels))
    predictions_ = model.predict(data_)
    predictions_ = np.squeeze(predictions_, axis=1)
    print("End")
    return data_, predictions_, labels_, categories_, mappings_
def predict_SKLearn(self, vectoriser, model, text):
    # Predict the sentiment of a single text string
    preprocessed, _ = preprocess([text])
    preprocessed_text = ' '.join([str(word) for word in preprocessed])
    textdata = vectoriser.transform([preprocessed_text])
    prediction = model.predict(textdata)
    return prediction[0]
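# Hedged usage sketch: the containing class is not shown, and the
# vectoriser/model pair here is an assumption -- any fitted sklearn
# vectoriser and classifier exposing transform/predict fits the calls above.
#
#     sentiment = analyser.predict_SKLearn(tfidf_vectoriser, logreg_model,
#                                          "I really enjoyed this film")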
import cv2

def nextBatch(self):
    rangeOfBatch = range(self.currIdx, self.currIdx + self.batchSize)
    trueText = [self.samples[i].trueText for i in rangeOfBatch]
    imgs = [
        preprocess(cv2.imread(self.samples[i].filePath, cv2.IMREAD_GRAYSCALE),
                   self.imgSize, self.dataAugmentation)
        for i in rangeOfBatch
    ]
    self.currIdx += self.batchSize
    return BatchImages(trueText, imgs)
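# Hedged usage sketch: `loader` and `train_on` are hypothetical names. The
# bounds check only uses attributes that nextBatch itself reads, since no
# hasNext()-style helper is shown above.
while loader.currIdx + loader.batchSize <= len(loader.samples):
    batch = loader.nextBatch()
    train_on(batch)  # hypothetical training step consuming one BatchImages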
def predict(vectoriser, model, text):
    # Predict the sentiment of the given text
    preprocessed, _ = preprocess(text)
    preprocessed_text = ' '.join([str(word) for word in preprocessed[0]])
    # Vectorise the whole document at once; tokenising it first would yield
    # one prediction per token instead of one per text.
    textdata = vectoriser.transform([preprocessed_text])
    prediction = model.predict(textdata)
    print(prediction[0])
import numpy as np
from sklearn import svm
from Preprocessing import preprocess

def SVM_classification(metrics):
    training_data, training_labels, test_data, test_labels, categories, mappings = preprocess(
        metrics, recalculate=False, causal=False)
    np.random.seed(42)
    SVR = svm.LinearSVR(C=1.0 / float(len(test_data)), max_iter=5000)
    SVR.fit(training_data, training_labels)
    data = np.concatenate((training_data, test_data))
    labels = np.concatenate((training_labels, test_labels))
    predictions = SVR.predict(data)
    return data, predictions, labels, categories, mappings
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from Preprocessing import preprocess

def naive_bayes_classification(metrics):
    training_data, training_labels, test_data, test_labels, categories, mappings = preprocess(
        metrics)
    NBC = MultinomialNB()
    NBC.fit(training_data, training_labels)
    data = np.concatenate((training_data, test_data))
    labels = np.concatenate((training_labels, test_labels))
    class_predictions = NBC.predict_proba(data)
    predictions = []
    for i in range(len(labels)):
        predictions.append(class_predictions[i][1])
    return data, predictions, labels, categories, mappings
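# Follow-up sketch: predict_proba's second column is P(class 1) when the
# labels are encoded as 0/1, so thresholding recovers hard labels. Both the
# 0.5 cutoff and the 0/1 encoding are assumptions here.
data, predictions, labels, categories, mappings = naive_bayes_classification(metrics)
hard_labels = [1 if p >= 0.5 else 0 for p in predictions]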
def SVM_classification(metrics_):
    """This function takes as input a list of metrics and uses a support
    vector machine to perform the classification.

    :param metrics_: List containing the metrics that we want to use for
        classification."""
    training_data, training_labels, test_data, test_labels, categories, mappings = \
        preprocess(metrics_, recalculate=False, causal=False)
    np.random.seed(42)
    SVR = svm.LinearSVR(C=1.0 / float(len(test_data)), max_iter=5000)
    SVR.fit(training_data, training_labels)
    data = np.concatenate((training_data, test_data))
    labels = np.concatenate((training_labels, test_labels))
    predictions = SVR.predict(data)
    return data, predictions, labels, categories, mappings
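# Note as a sketch: LinearSVR is a regressor, so `predictions` above are
# continuous scores rather than class labels. One hedged way to binarise
# them (the 0.5 threshold is an assumption):
data, predictions, labels, categories, mappings = SVM_classification(metrics)
binary_predictions = (predictions >= 0.5).astype(int)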
def naive_bayes_classification(metrics):
    """This function takes as input a list of metrics and performs the
    Naive Bayes classification.

    :param metrics: List containing the metrics that we want to use for
        classification."""
    training_data, training_labels, test_data, test_labels, categories, mappings = preprocess(
        metrics)
    NBC = MultinomialNB()
    NBC.fit(training_data, training_labels)
    data = np.concatenate((training_data, test_data))
    labels = np.concatenate((training_labels, test_labels))
    class_predictions = NBC.predict_proba(data)
    predictions = []
    for i in range(len(labels)):
        predictions.append(class_predictions[i][1])
    return data, predictions, labels, categories, mappings
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import optimizers
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score
import plotly.graph_objects as go
from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt

# In[ ]:

# Instantiate the preprocessing helper
preprocessing = preprocess()

# In[ ]:

# Preprocess the data to generate the word list
def generateWordList(fileData):
    # Generate a list of preprocessed words from the file
    finalSet = preprocessing.checkEmail(fileData)  # Collect emails found in the file
    fileData = preprocessing.removeEmail(fileData)  # Remove emails
    finalSet = finalSet + preprocessing.checkWebsite(fileData)  # Collect websites and decimal numbers
    fileData = preprocessing.removeWebsite(fileData)  # Remove websites and decimal numbers
    fileData = preprocessing.contractionsExpand(fileData)  # Expand contracted words
    fileData = preprocessing.caseChange(fileData)  # Lower-case the text
    fileData = preprocessing.removePunctuations(fileData)  # Remove punctuation
    wordList = preprocessing.wordSeperator(fileData)  # Separate words on whitespace
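# Hedged usage sketch: the cell above is truncated -- generateWordList builds
# finalSet and wordList but does not yet return them, so a closing
# `return finalSet + wordList` is my assumption about the missing tail, and
# the sample text is illustrative only.
#
#     words = generateWordList("Contact me at jane@example.com or visit "
#                              "https://example.com, it's great!")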
from sklearn import svm
from Preprocessing import preprocess
from Postprocessing import *
from utils import *

metrics = [
    "race", "sex", "age", 'c_charge_degree', 'priors_count', 'c_charge_desc'
]
training_data, training_labels, test_data, test_labels, categories, mappings = preprocess(
    metrics)

SVR = svm.LinearSVR(C=1.0 / float(len(test_data)), max_iter=10000)
SVR.fit(training_data, training_labels)

training_class_predictions = SVR.predict(training_data)
training_predictions = []
test_class_predictions = SVR.predict(test_data)
test_predictions = []
for i in range(len(training_labels)):
    training_predictions.append(training_class_predictions[i])
for i in range(len(test_labels)):
    test_predictions.append(test_class_predictions[i])

training_race_cases = get_cases_by_metric(training_data, categories, "race",
                                          mappings, training_predictions,
                                          training_labels)
test_race_cases = get_cases_by_metric(test_data, categories, "race", mappings,
                                      test_predictions, test_labels)
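# Hypothetical follow-up sketch: report the mean predicted score per race
# group. This assumes get_cases_by_metric returns a dict mapping each group
# name to (prediction, label) pairs -- its real return structure is
# project-specific and not shown above.
for group, cases in test_race_cases.items():
    mean_score = sum(p for p, _ in cases) / len(cases)
    print(group, round(mean_score, 3))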
from sklearn import svm
from Preprocessing import preprocess
from Report_Results import *
import numpy as np
from utils import *

metrics = ["sex", "age", 'race', 'c_charge_degree', 'priors_count',
           'c_charge_desc']  # age_cat
training_data, training_labels, test_data, test_labels, categories, mappings = preprocess(
    metrics, recalculate=False, causal=False)

def SVM_classification(metrics):
    np.random.seed(42)
    SVR = svm.LinearSVR(C=1.0 / float(len(test_data)), max_iter=5000)  # 5600
    SVR.fit(training_data, training_labels)
    # data = np.concatenate((training_data, test_data))
    # labels = np.concatenate((training_labels, test_labels))
    training_predictions = SVR.predict(training_data)  # data
    testing_predictions = SVR.predict(test_data)
    return training_data, test_data, training_predictions, testing_predictions, training_labels, categories, mappings  # labels

#######################################################################################################################

training_data, test_data, training_predictions, testing_predictions, labels, categories, mappings = SVM_classification(
    metrics)

training_race_cases = get_cases_by_metric(training_data, categories, "race",
                                          mappings, training_predictions,
                                          training_labels)
test_race_cases = get_cases_by_metric(test_data, categories, "race", mappings,
                                      testing_predictions, test_labels)
import pandas
from DB import db
from Preprocessing import preprocess

count = 0
db = db()
allTweets = db.getAll()
hashtagDatetimes = []

for tweet in allTweets:
    # Hashtags list
    terms_hash = [
        term for term in preprocess(tweet['text'].lower())
        if term.startswith('#')
    ]
    if '#kathygriffin' in terms_hash:
        hashtagDatetimes.append(tweet['created_at'])
    # if count == 60000:
    #     break
    count += 1
    print("\rLive number of processed tweets: " + str(count), end="")

print("\n")
print("length of occurrence array : " + str(len(hashtagDatetimes)))

# a list of "1" to count the hashtags
ones = [1] * len(hashtagDatetimes)
# the index of the series
idx = pandas.DatetimeIndex(hashtagDatetimes)
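# A minimal follow-up sketch of the likely next step: build a per-minute
# count series from the timestamps. `ones` and `idx` come from above; the
# one-minute resampling frequency is an assumption.
hashtag_counts = pandas.Series(ones, index=idx)
per_minute = hashtag_counts.resample('1Min').sum()
print(per_minute.head())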
import time
import pandas as pd
from nltk.corpus import stopwords
from Preprocessing import preprocess, bigrams, trigrams

# --- IMPORTING DATASET ---
# Using the Sentiment140 dataset with 1.6 million tweets
# https://www.kaggle.com/kazanova/sentiment140/data
DATASET_COLUMNS = ['target', 'ids', 'date', 'flag', 'user', 'text']
tweets_raw = pd.read_csv(r'.\dataset\tweets.csv',
                         sep=',',
                         quotechar='"',
                         encoding='latin-1',
                         names=DATASET_COLUMNS)
# The CSV's sentiment column is named 'target' (0 = negative, 4 = positive)
tweets_raw['sentiment'] = tweets_raw['target'].replace(4, 'pos')
tweets_raw['sentiment'] = tweets_raw['sentiment'].replace(0, 'neg')
tweets_raw, label = list(tweets_raw['text']), list(tweets_raw['sentiment'])

# The first 800,000 tweets are negative, the rest positive
neg_tweets_raw = tweets_raw[:800000]
pos_tweets_raw = tweets_raw[800000:]

t = time.time()
pos_tweets, pos_words_ngrams = preprocess(pos_tweets_raw)
neg_tweets, neg_words_ngrams = preprocess(neg_tweets_raw)
print('Text Preprocessing complete.')
print(f'Time Taken: {round(time.time()-t)} seconds')

# Balance the two classes by trimming the longer list
while len(neg_tweets) > len(pos_tweets):
    del neg_tweets[0]
    del label[-1]
while len(pos_tweets) > len(neg_tweets):
    del pos_tweets[0]
    del label[0]

t = time.time()
bgram = bigrams(neg_words_ngrams, stopwords.words('english'))
tgram = trigrams(neg_words_ngrams, stopwords.words('english'))
print('Ngrams processing complete.')
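# Illustrative sketch only: the repo's bigrams/trigrams helpers are not shown
# above, but the operation they name -- extracting adjacent word pairs and
# triples -- can be demonstrated with nltk's ngrams utility.
from nltk.util import ngrams

tokens = ['this', 'movie', 'was', 'really', 'great']
print(list(ngrams(tokens, 2)))  # [('this', 'movie'), ('movie', 'was'), ...]
print(list(ngrams(tokens, 3)))  # [('this', 'movie', 'was'), ...]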
from LSTM import lstm_model
from Ploting import plot
from Preprocessing import preprocess
from Trading_Algorithm import trading_algorithm, compute_earnings

csv_path = 'data/tickers_data/femeli-daily.csv'
history_points = 50

x_train, y_train, x_test, y_test, x_val, y_val, y_test_real, scale_back, tech_ind_train, tech_ind_test, tech_ind_val = preprocess(
    csv_path, history_points)

y_test_predicted, model = lstm_model(history_points, x_train, y_train, x_test,
                                     y_test, x_val, y_val, y_test_real,
                                     scale_back, tech_ind_train, tech_ind_test,
                                     tech_ind_val)

buys, sells = trading_algorithm(x_test, tech_ind_test, scale_back, model)
plot(y_test_real, y_test_predicted, buys, sells)

purchase_amt = 1000000000  # 1,000,000,000 Rials = 100 million Toman
print("Trading {} Rials over the test data's trading days yields a profit of {} Rials".format(
    purchase_amt,
    compute_earnings(buys, sells, purchase_amt) - purchase_amt))
from collections import Counter
from DB import db
from Preprocessing import preprocess, stop

count = 0
db = db()
allTweets = db.getAll()
count_all_hashtags = Counter()
count_all_terms = Counter()
dates_hashtag = []

for tweet in allTweets:
    tweetText = tweet['text'].lower()
    # Terms with stopwords removed
    termsWithoutStopwords = [
        term for term in preprocess(tweetText) if term not in stop
    ]
    # termsBigrams = bigrams(termsWithoutStopwords)
    # Hashtags list
    terms_hash = [
        term for term in preprocess(tweetText) if term.startswith('#')
    ]
    if '#marchfortruth' in terms_hash:
        dates_hashtag.append(tweet['created_at'])
    # Update the counter(s)
    count_all_terms.update(termsWithoutStopwords)
    count_all_hashtags.update(terms_hash)
    count += 1
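# Follow-up sketch: Counter.most_common is the standard way to inspect the
# tallies built above; the cutoff of 10 is an arbitrary choice.
print("top hashtags:", count_all_hashtags.most_common(10))
print("top terms:", count_all_terms.most_common(10))
print("'#marchfortruth' occurrences:", len(dates_hashtag))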