def predict(hours):
    """Forecast the next ``hours`` values of the series, one step at a time.

    The first step uses the fitted model stored in ``model.pkl``.  Every
    later step refits an ARIMA(1, 0, 0) model on the differenced history,
    which by then also contains the earlier forecasts.

    Args:
        hours: Number of steps to forecast.

    Returns:
        A list with one forecast per requested step; empty when
        ``hours`` is smaller than 1.
    """
    if hours < 1:
        # Nothing was requested: do not load data or emit the first forecast.
        return []

    hours_in_week = 168  # interval handed to difference()/inverse_difference()
    history = list(pp.get_data().values)
    predictions = []

    # First step: reuse the already fitted, pickled model.
    model_fit = ARIMAResults.load('model.pkl')
    yhat = inverse_difference(history, model_fit.forecast()[0], hours_in_week)
    predictions.append(yhat)
    history.append(yhat)

    # Remaining steps: refit on the history extended by previous forecasts.
    for _ in range(1, hours):
        diff = difference(history, hours_in_week)
        model_fit = ARIMA(diff, order=(1, 0, 0)).fit(trend='nc', disp=0)
        yhat = inverse_difference(history, model_fit.forecast()[0],
                                  hours_in_week)
        history.append(yhat)
        predictions.append(yhat)
    return predictions
def preprocess(device, channels):
    """Write one HDF file per case ``chb01`` .. ``chb24`` for ``device``.

    Cases whose output file already exists are skipped.  Nothing is written
    when ``channels`` is empty.

    Args:
        device: Sub-directory name below ``_data_path`` that receives the files.
        channels: Channel selection forwarded to ``preprocessing.get_data``.

    Returns:
        None.
    """
    print('Processing data for device:', device)
    print('Number of channels:', len(channels))
    if len(channels) == 0:
        print('Invalid channel selection, Skipping')
        return

    device_path = os.path.join(_data_path, device)
    for i in range(1, 25):
        case_num = f'{i:02d}'  # zero-padded to two digits
        case = f'chb{case_num}'
        file_path = os.path.join(device_path, f'{case}.hdf')
        if os.path.isfile(file_path):
            print('File already exists:', file_path)
            print('Skipping case')
            continue

        print('Processing case:', case)
        df = preprocessing.get_data(case_num, channels=channels)
        print('Resulting table:')
        print(df)
        print('Writing to disk:', case)
        print('File path:', file_path)
        # Pass the key by keyword: positional use is deprecated in pandas 2.x.
        df.to_hdf(file_path, key='df')
def main():
    """Train the MSNR classifier on top of a frozen BioBERT layer.

    For every epoch the model is trained once, then the training accuracy,
    the per-class test results and the test accuracy are printed.
    """
    # load BioBERT from Hugging Face
    file_name = "giacomomiolo/biobert_reupload"
    impressions, labels = get_data()
    biobert = BioBERT(file_name, impressions, labels)

    # get train and test data
    train_data, test_data = biobert.tokenize_and_split_data()

    model = MSNR(impressions, labels, biobert)
    # freeze BioBERT layer to only train our classifier
    model.layers[0].trainable = False

    for i in range(model.epochs):
        train(model, train_data[0], train_data[1], train_data[2])
        # Derive the last epoch index from the model instead of assuming 20 epochs.
        print("epoch:", i, "/", model.epochs - 1)

        # print accuracies
        train_accuracy = model.cat_acc.result().numpy()
        print("Keras Categorical Accuracy (train)", train_accuracy)

        results = test(model, test_data[0], test_data[1], test_data[2])
        print("per class accuracy:", results[1])
        print("# of examples per class:", results[2])
        test_accuracy = model.cat_acc.result().numpy()
        print("Keras Categorical Accuracy (test)", test_accuracy)
def train():
    """Build, compile and fit the model under a ``MirroredStrategy``.

    Progress messages are appended to ``DataLoading.txt`` and ``train.txt``;
    any exception raised while building or fitting is appended to
    ``Error.txt`` instead of propagating.
    """
    strategy = tf.distribute.MirroredStrategy()
    (x_train, y_train), _ = get_data()  # the test split is not used here
    with open('DataLoading.txt', 'a+') as f:
        App_Logger.log(f, 'Loaded data successfully...')

    callbacks = [
        keras.callbacks.TensorBoard(log_dir='./logs'),
        keras.callbacks.EarlyStopping(monitor='val_loss', patience=2,
                                      verbose=1),
        keras.callbacks.ReduceLROnPlateau(monitor='accuracy', factor=0.01,
                                          verbose=1),
    ]

    try:
        with strategy.scope():
            K.clear_session()
            myModel = model.create_model()
            myModel.compile(
                loss=keras.losses.SparseCategoricalCrossentropy(
                    from_logits=True),
                optimizer=keras.optimizers.Adam(),
                metrics=['accuracy'])

        # summary() returns None, so concatenating its result raised a
        # TypeError that the handler below swallowed before fit() could run.
        # Collect the summary text through print_fn instead.
        summary_lines = []
        myModel.summary(print_fn=summary_lines.append)
        with open('DataLoading.txt', 'a+') as f:
            App_Logger.log(
                f,
                'Created and compiled model....\n' + '\n'.join(summary_lines))

        history = myModel.fit(x_train, y_train, validation_split=0.25,
                              callbacks=callbacks, verbose=1)
        with open('train.txt', 'a+') as f:
            # history.history is a dict; convert before concatenating.
            App_Logger.log(f, 'Training successful ' + str(history.history))
    except Exception as e:
        # Deliberate best-effort boundary: record the failure and return.
        with open('Error.txt', 'a+') as f:
            App_Logger.log(f, str(e))
def _submission_frame(results):
    """Return a one-column ``Label`` frame indexed 1..len(results)."""
    return pd.DataFrame({'Label': results},
                        index=range(1, len(results) + 1))


def submit(model, do_ensemble=False, num_ensembles=3):
    """Predict labels for the test set and write a submission CSV.

    Args:
        model: Fitted model with a ``predict`` method; used when
            ``do_ensemble`` is false.
        do_ensemble: When true, train ``num_ensembles`` models through
            ``ensemble`` and sum their predictions before taking the argmax.
        num_ensembles: Number of ensemble members.

    Returns:
        None.  Writes ``submissions/submission.csv`` or
        ``ensemble_prediction.csv``.
    """
    test_X = get_data(training=False)
    print(test_X.shape)

    if not do_ensemble:
        results = np.argmax(model.predict(test_X), axis=1)
        print(results.shape)
        print(results[:10])
        # Index length follows the predictions instead of a fixed 28000.
        submission = _submission_frame(results)
        print(submission.head())
        submission.to_csv('submissions/submission.csv',
                          index_label='ImageId')
        return

    models = ensemble(num_ensembles, 30, 2000)
    summed = np.zeros((test_X.shape[0], 10))
    for member in models:
        summed = summed + member.predict(test_X)
    results = np.argmax(summed, axis=1)
    print(results.shape)
    print(results[:10])
    submission = _submission_frame(results)
    # Same header spelling as the single-model branch ('ImageId').
    submission.to_csv('ensemble_prediction.csv', index_label='ImageId')
def get_prediction_accuracy(params):
    """Log the accuracy of the stored predictions for ``params['dset_U']``.

    The predictions are read from ``<S_PATH><name>_predictions.txt``.  Nothing
    is logged when either the predictions or the labels are ``None``.
    """
    prediction_file = network.S_PATH + params['name'] + '_predictions.txt'
    pred = pp.get_prediction(params, prediction_file)
    _, Y = pp.get_data(params, params['dset_U'])

    if pred is None or Y is None:
        return

    pred, Y = pp.get_tensor(pred, Y)
    acc = get_accuracy(pred, Y)
    log("Predicted Accuracy: %f." % (acc), name=params['log_name'])
def main():
    '''
    Load the data from 'COS071212_MOCAP.mat', split it 80/20 into train and
    test sets, train the model for 200 epochs and evaluate it after every
    epoch. The per-epoch test results are plotted and their mean is printed.

    :return: None
    '''
    fr, km = get_data('COS071212_MOCAP.mat')

    # tf.random.shuffle returns a shuffled copy (it does not work in place),
    # so its result must be kept for the shuffle to take effect.
    indices = tf.random.shuffle(tf.range(0, len(fr)))
    fr = tf.gather(fr, indices)
    km = tf.gather(km, indices)

    eighty_p = int(len(fr) * 0.8)
    train_inp = fr[:eighty_p]
    train_lab = km[:eighty_p]
    test_inp = fr[eighty_p:]
    test_lab = km[eighty_p:]

    model = Model(29)

    final_results = 0
    num_epochs = 200
    loss_list = []
    for i in range(num_epochs):
        print("EPOCH: ", i)
        # Reshuffle the training pairs with one shared permutation.
        indices = tf.random.shuffle(tf.range(0, len(train_inp)))
        train_inp = tf.gather(train_inp, indices)
        train_lab = tf.gather(train_lab, indices)

        print("training")
        train(model, train_inp, train_lab)

        print("testing")
        results = test(model, test_inp, test_lab)
        loss_list.append(results)
        print("results: ", results)
        final_results += results

    epoch_list = tf.range(0, num_epochs)
    plt.xlabel('Epoch')
    plt.ylabel('Loss per Epoch')
    plt.title('Loss Between Predicted and Actual Kinematic Positions')
    plt.plot(epoch_list, loss_list)
    plt.show()
    print("final_results: ", final_results / num_epochs)
def main():
    """Train a model on the training split and report its test accuracy.

    Returns:
        The model returned by ``get_model``.
    """
    # import model and data
    X_train, X_test, y_train, y_test = get_data()
    training_pair = (X_train, y_train)
    model = get_model(data=training_pair)

    # evaluate it
    evaluation = model.evaluate(X_test, y_test)
    test_loss, test_accuracy = evaluation
    print(f"Test accuracy: {test_accuracy:.2f}")
    return model
def get_model(data=None):
    """
    Returns a model designed to work with the fashion_mnist dataset.

    Args:
        data: Indexable whose first two items are ``X_train`` and ``y_train``.
            When omitted, ``get_data()`` is called.

    Returns:
        The compiled Keras ``Sequential`` model after 10 epochs of fitting.
    """
    # Identity check: ``data == None`` is ambiguous for array inputs.
    if data is None:
        data = get_data()

    # get X_train and y_train from data
    X_train = data[0]
    y_train = data[1]
    del data

    # Initialise model
    model = tf.keras.models.Sequential()

    # Add first fully-connected hidden layer
    # Fully-connected means all nodes are connected
    model.add(
        tf.keras.layers.Dense(
            units=128,  # number of neurons
            activation='relu',  # ReLU function
            input_shape=(X_train.shape[1], )  # number of pixels = 28*28
        ))

    # Add second layer with dropout
    # Dropout layer means some nodes are not updated during
    # back-propagation
    model.add(tf.keras.layers.Dropout(0.2))

    # EXPERIMENTAL - additional layer
    model.add(tf.keras.layers.Dropout(0.4))

    # Add output layer, activated using softmax
    model.add(
        tf.keras.layers.Dense(
            units=10,  # number of classes in the dataset (i.e 0-9)
            activation='softmax'))

    # compile the model
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['sparse_categorical_accuracy'])

    # get a summary
    model.summary()

    # train the model
    model.fit(X_train, y_train, epochs=10)

    return model
def main():
    """Load the LIAR splits, build the three sub-models, then train and test."""
    # dataset locations
    train_path = "liar_dataset/train.tsv"
    test_path = "liar_dataset/test.tsv"
    valid_path = "liar_dataset/valid.tsv"

    loaded = get_data(train_path, test_path, valid_path, verbose=True)
    (data_train, data_test, labels_train, labels_test, subjects_train,
     subjects_test, word_index, unique_events) = loaded

    embedding_matrix = text_extract(word_index)
    extractor = Extractor_Model(embedding_matrix)
    detector = Detector_Model()
    discriminator = Discriminator_Model(unique_events)
    networks = (extractor, detector, discriminator)

    train(*networks, data_train, labels_train, subjects_train)
    test(*networks, data_test, labels_test, subjects_test)
def main(model, time_stamp, expl_var, expt):
    """Fit a ``PCABasic`` on the flattened training data and dump it to disk.

    Args:
        model: Name used in the checkpoint file name.
        time_stamp: Not used by this function.
        expl_var: Value handed to ``PCABasic``.
        expt: Experiment key used for data loading, ``cfg.num_trains`` and the
            checkpoint directory.
    """
    # NOTE(review): ``marker`` is not defined in this function; presumably a
    # module-level name -- confirm it is not meant to be ``time_stamp``.
    X_train, _X_valid, _y_train, _y_valid = get_data(expt)

    flat_train = X_train.reshape(cfg.num_trains[expt], -1)
    pca = PCABasic(expl_var)
    pca.train(flat_train)

    sep()
    summary = '\nExplained Variance: {}\nNum Components: {}'.format(
        str(expl_var),
        pca.num_components,
    )
    logging.info(summary)

    model_ckpt = 'ckpts/{}/models/{}_{}.pkl'.format(expt, model, marker)
    sep()
    logging.info('Saving: {}'.format(model_ckpt))
    joblib.dump(pca, model_ckpt)
from statsmodels.tsa.arima_model import ARIMAResults
import preprocessing as pp
from arima import inverse_difference

# One-step forecast from the pickled model, mapped back through
# inverse_difference with an interval of 168.
series = pp.get_data()
hours_in_week = 168
model_fit = ARIMAResults.load('model.pkl')

forecast = model_fit.forecast()
yhat = inverse_difference(series.values, forecast[0], hours_in_week)
print("Predicted: %d" % yhat)

# Print the first validation entry next to the prediction.
validate = pp.get_validate()
print(validate[0])
def get_accuracy(y_hat, y_pred):
    '''
    Return the percentage of entries of ``y_pred`` that equal the argmax of
    the matching row of ``y_hat``. The shape of ``y_hat`` is printed first.
    '''
    print(y_hat.shape)
    row_argmax = np.argmax(y_hat, axis=1)
    matches = np.array(y_pred) == row_argmax
    return 100 * np.sum(matches) / len(y_pred)


if __name__ == '__main__':
    # 1. Read in images (training and testing) and corresponding labels
    X, X_test, y, y_test = get_data(
        folder_training="data/GTSRB/Final_Training/Images/",
        folder_testing="data/GTSRB/Final_Test/Images/")

    # 2. Split into training and validation set
    X_train, X_val, y_train, y_val = split_train_validation(X, y)

    # 3. Create model and optimizer, 4. train with cross entropy
    trained_model = train_model(create_model(), X_train, X_val, y_train,
                                y_val)

    # 5. Run test set on model
    y_pred = predict_labels(trained_model, X_test)
    print(get_accuracy(y_test, y_pred))

    # 6. Print out results (Confusion matrix, accuracy, training, validation and testing error)
import numpy as np  # np.where is used below; missing from the original import list
from sklearn.preprocessing import StandardScaler
from keras.layers import Input, Dense
from keras.models import Model
from keras.callbacks import TensorBoard
import tensorflow as tf
from keras import regularizers, optimizers, backend as K
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt
import warnings

warnings.filterwarnings('ignore')
from preprocessing import get_data
import Utils

train, test = get_data(encoding="Hash_encoder")
train_label = train.label.values
train.drop(["label"], axis=1, inplace=True)

# 0 : normal, 1 : anomal
# The scaler is fitted on every training row; only rows labelled 0 are kept.
Scaler = StandardScaler()
train = Scaler.fit_transform(train.values)[np.where(train_label == 0)]
test, ytest = Scaler.transform(test.drop(["label"], axis=1)), test.label.values


# AUTOENCODER
def fit_model(X, lr=0.001, l2=0.001, ep=100, bs=50):
    input_dim = X.shape[1]
    latent_space_size = 15
def train_C(params):
    """Train the reference classifier C for ``params['runs']`` runs.

    Each run loads a classifier through ``network.load_Ref``, trains it with
    BCE loss and Adam, and evaluates it on the validation set every
    ``params['save_step']`` epochs. Runs and epochs before
    ``params['start_run']`` / ``params['start_epoch']`` are skipped, and these
    two entries of ``params`` are updated and saved as training progresses.
    Afterwards the mean accuracy curve and a confusion matrix over the
    validation predictions of all runs are plotted, and the final accuracy is
    logged.

    Args:
        params: Settings dict; mutated in place (``start_run``,
            ``start_epoch``).
    """
    # -------------------
    #  Parameters
    # -------------------
    log(str(params), name=params['log_name'])

    # -------------------
    #  CUDA
    # -------------------
    C_Loss = torch.nn.BCELoss()
    if torch.cuda.is_available():
        C_Loss.cuda()
        log("CUDA Training.", name=params['log_name'])
    else:
        log("CPU Training.", name=params['log_name'])

    # -------------------
    #  Data scaling
    # -------------------
    # XTL ... Training data labelled
    # XTU ... Training data unlabelled
    # XL  ... Labelled data
    # XU  ... Unlabelled data
    # XV  ... Validation data
    dset_L = params['dset_L']
    dset_V = params['dset_V']

    XTL, YTL = pp.get_data(params, dset_L)
    XV, YV = pp.get_data(params, dset_V)

    XTL = pp.scale_minmax(XTL)
    XV = pp.scale_minmax(XV)

    if params['ratio_V'] < 1.0:
        # Subsample with the validation ratio (was 'ratio_L', contradicting
        # both the condition above and the log message below).
        XV, YV = pp.select_random(XV, YV, params['ratio_V'])
        log("Selected %s of validation samples." %
            (format(params['ratio_V'], '0.2f')),
            name=params['log_name'])
    XV, YV = pp.get_tensor(XV, YV)

    # -------------------
    #  Load accuracy
    # -------------------
    mat_accuracy_C = network.load_R_Acc(params)

    # -------------------
    #  Start Training
    # -------------------
    YF = None
    PF = None

    for run in range(params['runs']):

        # -------------------
        #  Training Data
        # -------------------
        XL, YL = XTL, YTL

        if params['ratio_L'] < 1.0:
            XL, YL = pp.select_random(XL, YL, params['ratio_L'])
            log("Selected %s of labelled samples." %
                (format(params['ratio_L'], '0.2f')),
                name=params['log_name'])

        count_L = YL.shape[0]
        log("Number of labelled samples = %d." % (count_L),
            name=params['log_name'])

        dataloader = pp.get_dataloader(params, XL, YL)

        C = network.load_Ref(run, params)

        # -------------------
        #  Optimizers
        # -------------------
        optimizer_C = torch.optim.Adam(C.parameters(),
                                       lr=params['CLR'],
                                       betas=(params['CB1'], params['CB2']))

        # -------------------
        #  Training
        # -------------------
        if run >= params['start_run']:

            if params['oversampling']:
                # NOTE(review): ``dataloader`` was built above from the
                # pre-oversampling XL/YL, so the new samples do not appear to
                # reach training -- confirm whether the loader should be
                # rebuilt here.
                XL, YL = pp.over_sampling(params, XL, YL)
                log("Oversampling: created %d new labelled samples." %
                    (XL.shape[0] - count_L),
                    name=params['log_name'])

            for epoch in range(params['epochs']):

                # Jump to start epoch
                if run == params['start_run']:
                    if epoch < params['start_epoch']:
                        continue

                running_loss_C = 0.0

                for i, data in enumerate(dataloader, 1):

                    # -------------------
                    #  Train the classifier on real samples
                    # -------------------
                    X1, Y1 = data

                    optimizer_C.zero_grad()
                    P1 = C(X1)
                    loss = C_Loss(P1, Y1)
                    loss.backward()
                    optimizer_C.step()

                    # -------------------
                    #  Calculate overall loss
                    # -------------------
                    running_loss_C += loss.item()

                # -------------------
                #  Post Epoch
                # -------------------
                logString = "[Run %d/%d] [Epoch %d/%d] [C loss: %f]" % (
                    run + 1, params['runs'], epoch + 1, params['epochs'],
                    running_loss_C / (i))
                log(logString, save=False, name=params['log_name'])

                if (epoch + 1) % params['save_step'] == 0:
                    idx = run, int(epoch / params['save_step']) + 1

                    # Predict labels
                    PV = C(XV)
                    acc_C_real = get_accuracy(PV, YV)
                    mat_accuracy_C[idx] = acc_C_real

                    logString = "[Run %d/%d] [Epoch %d/%d] [C acc: %f ]" % (
                        run + 1, params['runs'], epoch + 1, params['epochs'],
                        acc_C_real)
                    log(logString, save=True, name=params['log_name'])

                    # Checkpoint model, accuracy matrix and resume position.
                    network.save_Ref(params['name'], run, C)
                    network.save_R_Acc(params, mat_accuracy_C)

                    params['start_epoch'] = epoch + 1
                    network.save_Parameter(params)

            # End of Training Run
            params['start_run'] = run + 1
            params['start_epoch'] = 0
            network.save_Parameter(params)

        # -------------------
        #  Post Run
        # -------------------

        # Classify Validation data
        PC = C(XV).detach()

        # Identity test: ``==`` against a tensor is not a None check.
        if YF is None:
            YF = YV
            PF = PC
        else:
            YF = torch.cat((YF, YV), 0)
            PF = torch.cat((PF, PC), 0)

    # -------------------
    #  Post Training
    # -------------------
    timeline = np.arange(0, params['epochs'] + 1, params['save_step'])

    # -------------------
    #  Plot Accuracy
    # -------------------
    acc_C = np.mean(mat_accuracy_C, axis=0)

    fig, ax = plt.subplots()

    legend = []
    cmap = plt.get_cmap('gnuplot')
    indices = np.linspace(0, cmap.N, 7)
    colors = [cmap(int(i)) for i in indices]

    ax.plot(timeline, acc_C, c=colors[0], linestyle='solid')
    legend.append("Accuracy $A_C$")

    ax.set_xlim(0.0, params['epochs'])
    ax.set_ylim(0.0, 1.0)

    ax.legend(legend)
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Accuracy')

    ax.grid()
    save_fig(params, 'eval', fig)

    # -------------------
    #  Generate Confusion Matrix
    # -------------------
    YF = pp.one_hot_to_labels(params, YF)
    PF = pp.one_hot_to_labels(params, PF)

    con_mat = confusion_matrix(YF,
                               PF,
                               labels=None,
                               sample_weight=None,
                               normalize='true')
    plot_confusion_matrix(con_mat,
                          params,
                          name='C',
                          title='Confusion matrix')

    # -------------------
    #  Log Results
    # -------------------
    log(" - " + params['name'] + ": [C acc: %f]" % (acc_C[-1]),
        name='results')
# Pre-processing a document.
def preprocess_gensim(doc):
    """Tokenise raw text and drop stop-words and non-alphabetic tokens.

    Args:
        doc: Raw text.

    Returns:
        List of lower-cased tokens that are purely alphabetic and not in
        ``stop_words``.
    """
    return [
        w for w in word_tokenize(doc.lower())
        if w not in stop_words and w.isalpha()
    ]


# Train a word2vec model with default vector size of 100
def train_word2vec(train_data, worker_no=3, vector_size=100,
                   model_name="word2vec_model"):
    """Train a word2vec model on the preprocessed data and save it.

    Args:
        train_data: Sequence of raw documents; nothing is trained when empty.
        worker_no: Forwarded to ``Word2Vec`` as ``workers``.
        vector_size: Forwarded to ``Word2Vec`` as ``size``.
        model_name: Path the model is saved to.

    Returns:
        None.
    """
    if not train_data:
        # Single-argument print() behaves the same on Python 2 and Python 3.
        print("no training data")
        return
    w2v_corpus = [preprocess_gensim(doc) for doc in train_data]
    # NOTE(review): gensim 4 renamed ``size`` to ``vector_size`` -- confirm
    # the installed version before changing the keyword.
    model = Word2Vec(w2v_corpus, workers=worker_no, size=vector_size)
    model.save(model_name)
    print("Model Created Successfully")


# Load the Model
def load_model(path="word2vec_model"):
    """Load the stored word2vec model from ``path`` and return it."""
    return Word2Vec.load(path)


if __name__ == "__main__":
    train_data = get_data(sys.argv[1])
    train_word2vec(train_data)
import pygal
from preprocessing import get_data, sortListToDicts

year = 2014
filename = 'YouthLiteracyRate.csv'

# Load the per-country values and split them into the four legend buckets.
country_name, country_code, data = get_data(filename, year)
no_data, less_than_50, less_than_75, more_than_75 = sortListToDicts(
    country_code, data)

wm = pygal.maps.world.World()
wm.title = 'Literacy rate, youth total (% of people ages 15-24)'

# One map series per bucket, added in legend order.
for label, bucket in (
        ("No data", no_data),
        ("< 50%", less_than_50),
        ("< 75%", less_than_75),
        ("> 75 %", more_than_75),
):
    wm.add(label, bucket)

wm.render_to_file('test.svg')
import numpy as np  # np.where is used below; missing from the original import list
from keras.layers import Input, Dense
from keras.models import Model
import tensorflow as tf
from keras import optimizers, regularizers, backend as K
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
import seaborn as sn
# %matplotlib inline
from preprocessing import get_data
import Utils

# for NLS-KDD
train, test, indexes = get_data("multiclass")
train_label = train.label
train = train.drop(["label"], axis=1)

# The scaler is fitted on every training row; only rows labelled 1 are kept.
Scaler = StandardScaler()
train = Scaler.fit_transform(train.values)[np.where(train_label == 1)]
xtest, ytest = Scaler.transform(test.drop(["label"], axis=1)), test.label.values


def fit_model(params, X, latent=10, BS=250, ep=95):
    input_dim = X.shape[1]
    latent_space_size = latent
import catboost as cb
from hyperparam_optimizing import (
    CATBOOST_BAYESSEARCH_PARAMS,
    CATBOOST_RANDOMSEARCH_PARAMS,
    SCORING_LIST,
    perform_bayes_search,
    perform_random_search,
)
from preprocessing import get_data
from scoring import calculate_scores

VAL_SPLIT = 0.2

data = get_data(val_split=VAL_SPLIT, apply_label_encoding=True, fillna=True)

# Pull the individual splits out of the returned mapping, in this order.
_split_keys = (
    "X_train",
    "X_val",
    "X_test",
    "y_train",
    "y_val",
    "categorical_features",
)
X_train, X_val, X_test, y_train, y_val, categorical_features = (
    data[key] for key in _split_keys)

# Classifier settings, kept together for readability.
_clf_settings = {
    "n_estimators": 200,
    "learning_rate": 0.05,
    "metric_period": 500,
    "od_wait": 500,
    "task_type": "CPU",
    "depth": 8,
}
clf = cb.CatBoostClassifier(**_clf_settings)
import os
from preprocessing import get_data, vectorize_data
from gridsearchcv import train
from evaluate import get_acc, print_report, caculate_confidence, predict

# Path to file
root_path = os.path.dirname(__file__)
model_path = os.path.join(root_path, "result/model.sav")
report_path = os.path.join(root_path, "result/report.xlsx")
train_file = os.path.join(root_path, "data/train.txt")
test_file = os.path.join(root_path, "data/test.txt")

# Get data
X_train, y_train = get_data(train_file)
X_test, y_test = get_data(test_file)

# Vectorizer
X_train, y_train, X_test, y_test, vectorizer, le = vectorize_data(
    X_train, y_train, X_test, y_test)

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test : {X_test.shape}\n")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test : {y_test.shape}\n")
# Row counts come from .shape so this also works for sparse matrices,
# where len() raises TypeError.
print(f"Ratio: {X_train.shape[0]/X_test.shape[0]}")

# Training
import matplotlib matplotlib.use('Agg') print('\n\n\nRunning\n\n\n') from model import CNN from preprocessing import get_data import matplotlib.pyplot as plt import matplotlib from sklearn.model_selection import train_test_split from keras.models import load_model from keras.preprocessing.image import ImageDataGenerator from keras.callbacks import LearningRateScheduler X, y = get_data(amount=42000) im_shape = X[0].shape train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.1) print(im_shape) iterate = False augment = True if augment: datagen = ImageDataGenerator(rotation_range=10, zoom_range=0.1, width_shift_range=0.1, height_shift_range=0.1) annealer = LearningRateScheduler(lambda x: 1e-3 * 0.95**x) if not iterate: def train_CNN(data_portion=X.shape[0], epochs=20, ensemble=False):
#### model parameters ####
cnn_layers = [32, 64, 128]
cnn_kernels = [3, 3, 3]
cnn_dropout = [.5, .5, .5]
lstm_layers = [128]
lstm_dropout = [.5]
vector_size = 128
lr = 0.001
epochs = 20
batch_size = 64
ntest_sers = 1000
verbose = True

####### preprocessing and data ##########
data_csv_path = 'data/taonews.csv'
embedding_pretrained_model_path = 'data/glove.6B.100d.txt'
##################################

from preprocessing import get_data
from model import model, train

X, Y = get_data(data_csv_path, embedding_pretrained_model_path)
# NOTE: this rebinds ``model`` from the imported factory to the built model.
model = model(X, Y, cnn_layers, cnn_kernels, cnn_dropout, lstm_layers,
              lstm_dropout, vector_size)

# training
# Forward the ``verbose`` setting above instead of a hard-coded True, so the
# configuration value actually takes effect.
train(model, X, Y, lr, epochs, batch_size, ntest_sers, verbose=verbose)
def main():
    """Load the 'Default' data split, run ``nn`` on it and print its score."""
    splits = get_data(type_of_data='Default')
    X_train, X_test, y_train, y_test = splits
    print(nn(X_train, X_test, y_train, y_test))
from scipy.cluster.hierarchy import ward, dendrogram
import preprocessing
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.metrics.pairwise import cosine_similarity

tfidf_matrix, titles, ranks, synopses, genres, vocab_frame, terms = preprocessing.get_data()

dist = 1 - cosine_similarity(tfidf_matrix)

# define the linkage_matrix using ward clustering pre-computed distances
linkage_matrix = ward(dist)

fig, ax = plt.subplots(figsize=(15, 20))  # set size
ax = dendrogram(linkage_matrix, orientation="right", labels=titles)

# Real booleans are required here: the string 'False' is truthy, so the
# original call left the ticks and labels switched on.
plt.tick_params(
    axis='x',  # changes apply to the x-axis
    which='both',  # both major and minor ticks are affected
    bottom=False,  # ticks along the bottom edge are off
    top=False,  # ticks along the top edge are off
    labelbottom=False)

plt.tight_layout()  # show plot with tight layout

# save figure
plt.savefig('ward_clusters.png', dpi=200)
# coding: utf-8

# In[1]:

#get_ipython().system(u'jupyter nbconvert --to script Keras_Sentence_RNN.ipynb')
import preprocessing
import numpy as np

# In[2]:

import collections

sentence_max_threshold = 50000
tokenizer, max_sentence_len_word, labels, train_X, test_X, train_y, test_y = preprocessing.get_data(
    sentence_max_threshold)

# Single-argument print() with a pre-built string produces the same output on
# Python 2 and Python 3 (the original used Python 2 print statements).
print(" ".join(
    str(value)
    for value in (train_X.shape, train_y.shape, len(tokenizer.word_counts))
))  #, len(tokenized_text)

# Count how often each label (as a string) occurs in the test labels.
x_count = collections.Counter(str(label) for label in test_y)

for key, value in sorted(x_count.items(), reverse=True):
    print("%s %s %s" % (key, value, float(value) / sentence_max_threshold))

# ### Use Keras_Sentence_RNN.py to avoid time-out problem
# If the trained model runs too long, it will time out. To get around this issue, you can skip the run here and instead use Keras_Sentence_RNN.py to train and save the model, then load the saved model here to predict the data.
#

# In[5]:
import networkx as nx

level = 3
numofkeys = 1  #the number of mainkeywords
#2**level -1 #sum of G.P. with common ratio = 2

from preprocessing import get_data, word_by_sent, wbys_to_word, word_to_idx, idx_by_sent

# Preprocessing chain: text -> words per sentence -> word list -> word/index
# mapping -> indices per sentence.
text = get_data()
wbys = word_by_sent(text)
wordlist = wbys_to_word(wbys)
wtoi = word_to_idx(wordlist)
ibys = idx_by_sent(wbys, wtoi)

from textrank import count_window, textrank_keyword

# Window counts (window argument 5) and the ``numofkeys`` main keywords.
counter = count_window(ibys, 5)
mainkeywords = textrank_keyword(ibys, wordlist, numofkeys)

import visualization as vis

cnt_draw = vis.counter_draw(counter, wordlist)
IG = vis.initialGraph(cnt_draw, wordlist)
# vis.drawgraph(IG, cmap = "Blues", nodesize = 350, graphtype = None, savepath=None, show = True)
vis.communityGraph(IG)
# vis.drawgraph(IG, cmap = "Pastel1", nodesize = 350, graphtype = "community", savepath="community.png", show = False)
# energy_SG = vis.subGraph(IG, "energy")
# vis.drawgraph(energy_SG, cmap = "Oranges", nodesize = 350, graphtype = None, savepath="subgraph.png", show = False)
# energy_SCG = vis.subCommunityGraph(IG, "energy") #only after communityGraph() method
# vis.drawgraph(energy_SCG, cmap = "Pastel1", nodesize = 350, graphtype = "community", savepath="subcommunity.png", show = False)
"""
The core of this project:
import numpy as np  # np.asarray is used below; missing from the original import list
from sklearn import metrics
import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.ensemble import BaggingClassifier
from sklearn import tree

x_train, y_train, x_test, y_test = preprocessing.get_data()

# Bagging ensemble over decision trees with a fixed tree seed.
model = BaggingClassifier(tree.DecisionTreeClassifier(random_state=1))
model.fit(x_train, y_train)

y_pred = np.asarray(model.predict(x_test))

# Keep the result under its own name so the imported confusion_matrix
# function is not overwritten.
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)
print(classification_report(y_test, y_pred))
import numpy as np import preprocessing import postprocessing import matplotlib.pyplot as plt import sklearn, sklearn.tree, sklearn.model_selection, sklearn.ensemble ftcount = 531 datafile = 'Dataset/dataset.train' train = preprocessing.get_data(datafile, ftcount) trainm = preprocessing.mask_unused_features(train) x = [] meany = [] for t in range(2, 11): results = [] x = [] meany = [] sdy = [] for t in 2, 3, 4, 5, 6, 7, 8, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 150, 180, 200, 250: results = [] for i in range(1, 5): rf = sklearn.ensemble.GradientBoostingClassifier(max_depth=t) cv_rf = sklearn.model_selection.cross_val_score(rf, trainm[:, :-1],
import time # Load Hyperparameters epochs = params['epochs'] batch_size = params['batch_size'] rnn_size = params['rnn_size'] num_layers = params['num_layers'] encoding_embedding_size = params['encoding_embedding_size'] decoding_embedding_size = params['decoding_embedding_size'] learning_rate = params['learning_rate'] learning_rate_decay = params['learning_rate_decay'] min_learning_rate = params['min_learning_rate'] keep_probability = params['keep_probability'] # Preprocess data, get the vocabularies questions, answers = get_data() sorted_questions, sorted_answers, questionswords2int, answerswords2int = preprocess_data( questions, answers) # Splitting the questions and answers into training and validation sets training_validation_split = int(len(sorted_questions) * 0.15) training_questions = sorted_questions[training_validation_split:] training_answers = sorted_answers[training_validation_split:] validation_questions = sorted_questions[:training_validation_split] validation_answers = sorted_answers[:training_validation_split] # Training batch_index_check_validation_loss = ( (len(training_questions)) // batch_size // 2) - 1 total_training_loss_error = 0 list_validation_loss_error = []
# For correct argument parsing
def str2bool(arg):
    """Convert a command-line style value to ``bool``.

    Booleans are returned unchanged. Strings are compared case-insensitively
    against the accepted spellings of true and false.

    Raises:
        argparse.ArgumentTypeError: If the text matches neither set.
    """
    if isinstance(arg, bool):
        return arg

    text = arg.lower()
    truthy = ('yes', 'true', 't', 'y', '1')
    falsy = ('no', 'false', 'f', 'n', '0')
    if text in truthy:
        return True
    if text in falsy:
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')


# Get datasets
print("Preparing data and tokenizer...")
train_data, validation_data, test_data, tokenizer = get_data()

# Initialize argument parser
parser = argparse.ArgumentParser()

# Model selection, device selection
parser.add_argument('--model',
                    type=str,
                    default="vae",
                    help='Select model to use')
parser.add_argument('--device',
                    type=str,
                    default=device,
                    help='Select which device to use')

# Standard model parameters
import nltk
from preprocessing import get_data
from n_gram import count_n_grams, suggest_a_word

if __name__ == "__main__":
    # Build unigram and bigram counts, then print one suggestion per prompt.
    tokenized_sentences, word_counts = get_data()

    print("building n-gram model 🚀🚀")
    unique_words = list(word_counts.keys())
    unigram_counts, bigram_counts = (
        count_n_grams(tokenized_sentences, order) for order in (1, 2))
    print("Finshed building the model 🎯")

    print("Some results from the model 👀 :-")
    texts = [
        "how", "i like", "you", "please", "i need", "give me your",
        "allow us to"
    ]
    for text in texts:
        previous_tokens = nltk.word_tokenize(text)
        suggestion, max_prob = suggest_a_word(previous_tokens,
                                              unigram_counts,
                                              bigram_counts,
                                              unique_words,
                                              k=1.0)
        print("Text :", text)
        print("Suggestion :", suggestion)
        # print(f"Suggestion : {suggestion} -> {int(max_prob*100)}%")
        print("----------------------------------")