def train_bert_model(self, use_generator=False):
    data_path = self._para.data_path
    _, train_y, _, _, _, val_y, _, tags = pickle.load(open(data_path + '/nlp_ner.pk', 'rb'))
    self._para.tag_num = len(tags)
    model = ModelLib.BERT_MODEL(self._para)
    checkpoint = ModelCheckpoint(data_path + '/bert-bilstm', monitor='val_viterbi_acc',
                                 verbose=1, save_best_only=True, mode='max')
    pre_process = preProcess(self._para)
    if use_generator:
        val_x = pre_process.load_bert_test(data_path + '/dev.txt', self._para.sep)
        model.fit_generator(
            Generator(self._para).bert_generator(self._para.batch_size,
                                                 data_path + '/train.txt',
                                                 self._para.sep, train_y, shuffle=True),
            steps_per_epoch=train_y.shape[0] // self._para.batch_size + 1,
            callbacks=[checkpoint],
            validation_data=(val_x, val_y),
            epochs=self._para.EPOCHES,
            verbose=1)
    else:
        train_x, val_x = pre_process.load_bert_train_dev()
        logger.info('%s, %s' % (train_x.shape, train_y.shape))
        logger.info('%s, %s' % (val_x.shape, val_y.shape))
        model.fit(train_x, train_y,
                  batch_size=self._para.batch_size,
                  epochs=self._para.EPOCHES,
                  callbacks=[checkpoint],
                  validation_data=(val_x, val_y),
                  shuffle=True,
                  verbose=1)
    model.save(data_path + '/bert-bilstm')

def trainingModel():
    # load data
    data = pd.read_excel("../Ica_Labelled Tweets (selesai).xlsx", index_col=None,
                         sheet_name='tweets_text', skiprows=[0, 1, 2], na_values=['-', ' '])
    cleaned_data = preProcess(data)

    # train test split
    X = cleaned_data[['tweet']]
    y = cleaned_data[['label']]
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.3, random_state=42)

    # train model
    model = XGBClassifier()
    model.fit(X_train, y_train)

    # test model & evaluate predictions
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))

    # save model
    model_filename = 'final_model.sav'
    pickle.dump(model, open(model_filename, 'wb'))

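# A minimal sketch (not part of the original code) of reloading the model
# saved by trainingModel() and scoring new data. `new_tweets` is a
# hypothetical DataFrame assumed to be preprocessed and feature-extracted
# the same way as the training data.
import pickle

with open('final_model.sav', 'rb') as f:
    loaded_model = pickle.load(f)
# y_pred = loaded_model.predict(new_tweets)
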
def test(para):
    logger.info('=======Start testing process=======')
    data_path = para.data_path
    _, _, _, test_y, _, _, _, tags = pickle.load(open(data_path + '/nlp_ner.pk', 'rb'))
    para.tag_num = len(tags)
    model = ModelLib.BERT_MODEL(para)
    model.load_weights(filepath=data_path + '/bert-bilstm')
    pre_process = preProcess(para)
    test_x = pre_process.load_bert_test()
    lengths = pre_process.get_lengths(test_y)
    pred_y = model.predict(test_x)
    tag_pred_y = []
    tag_test_y = []
    for i, y in enumerate(pred_y):
        # Convert per-token probability distributions to tag indices,
        # then cut both prediction and ground truth at the true sentence length.
        y = [np.argmax(dim) for dim in y]
        p_y = y[:lengths[i]]
        t_y = test_y[i][:lengths[i]].flatten()
        p_y = [tags[dim] for dim in p_y]
        t_y = [tags[dim] for dim in t_y]
        tag_pred_y.append(p_y)
        tag_test_y.append(t_y)
    accuracy = ner_accuracy(tag_pred_y, tag_test_y)
    logger.info('Test accuracy %f' % accuracy)
    precision, recall, f1_score = F1(tag_pred_y, tag_test_y)
    logger.info('Precision=%f, Recall=%f, f1-score=%f' % (precision, recall, f1_score))
    return tag_pred_y, tag_test_y

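# `ner_accuracy` is called above but not defined in this snippet. A minimal
# sketch of a token-level accuracy over tag sequences, assuming the two
# arguments are parallel lists of equal-length tag lists (illustrative only;
# the original may score entities rather than tokens):
def ner_accuracy_sketch(tag_pred_y, tag_test_y):
    correct = sum(p == t
                  for pred, test in zip(tag_pred_y, tag_test_y)
                  for p, t in zip(pred, test))
    total = sum(len(test) for test in tag_test_y)
    return correct / total if total else 0.0
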
def submit():
    inputData = {'tweet': [txtInput.get()]}
    data = pd.DataFrame(data=inputData)
    cleaned_data = preProcess(data)
    cleaned_data = featureExtraction(cleaned_data)
    result = predictInput(cleaned_data)
    result.insert(loc=0, column='tweet', value=[txtInput.get()])
    lblOutput.configure(text=str(result))

def make_predictions_wraper(file_path='test.csv', df_=None, return_predictions=False,
                            save_name=None):
    # Select the ones who missed the test
    if df_ is None:
        df = pd.read_csv(file_path, index_col='NU_INSCRICAO')
    else:
        df = df_.copy()
    # Rows with a null NU_NOTA_LC (absent candidates) get a score of zero;
    # `[[]]` keeps only the index.
    zeros = df.isnull().query('NU_NOTA_LC')[[]]
    zeros['NU_NOTA_MT'] = 0

    # Get ID of columns to be predicted
    predictions = df[['NU_NOTA_LC']].dropna()[[]]  # .index

    # Load data
    if df_ is None:
        features, pipe = preProcess(path=file_path, train=False)
    else:
        features, pipe = preProcess(df_=df_, train=False)

    # Load models
    try:
        with open('models.pkl', 'rb') as f:
            models, metrics = pickle.load(f)
    except Exception as exc:
        raise ValueError('Model File Missing') from exc

    # Make predictions (`ensamble_method` is expected to be defined at module scope)
    predictions['NU_NOTA_MT'] = predictions_ensemble(
        features, models, ensamble_method=ensamble_method).values
    predictions_full = pd.concat([zeros, predictions])
    if save_name is not None:
        predictions_full.to_csv(save_name)
    if return_predictions:
        return predictions_full

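# `predictions_ensemble` is called above but not defined here. A minimal
# sketch under the assumption that `models` is a list of fitted regressors,
# `features` is a DataFrame, and "mean" averaging is one supported method:
import numpy as np
import pandas as pd

def predictions_ensemble_sketch(features, models, ensamble_method='mean'):
    # Stack each model's predictions as columns, then combine.
    preds = np.column_stack([m.predict(features) for m in models])
    if ensamble_method == 'mean':
        return pd.Series(preds.mean(axis=1), index=features.index)
    raise ValueError('Unsupported ensemble method: %s' % ensamble_method)
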
def train_wraper(file_path='train.csv', return_arg=False, plot=False, verbose=True,
                 pipe_path='pipe.pkl'):
    features, target = preProcess(path=file_path)
    models, metrics = create_models(features, target, plot=plot, verbose=verbose,
                                    pipe_path=pipe_path)
    if return_arg:
        return models, metrics

def prepare_traindata():
    # Populates feature_list and featuresAndSentiment
    print("building the train dataset....")
    # Open in text mode ('r'), not binary: csv.reader expects str rows in Python 3.
    with open("/home/uday/DjangoProjects/stocks/prediction/Sentiment/train_data.csv", 'r') as csvfile:
        csv_reader = csv.reader(csvfile)
        for row in csv_reader:
            tweet = row[0]
            sentiment = row[1]
            featureVector = preProcess(tweet)
            for feature in featureVector:
                feature_list.append(feature)
            featuresAndSentiment.append((featureVector, sentiment))
    # No explicit close() needed: the `with` block closes the file.

def make_predictions_wraper_stacking(file_path='test.csv', df_=None, return_predictions=False,
                                     save_name=None, models_path='models-stack.pkl'):
    # Select the ones who missed the test
    if df_ is None:
        df = pd.read_csv(file_path, index_col='NU_INSCRICAO')
    else:
        df = df_.copy()
    zeros = df.isnull().query('NU_NOTA_LC')[[]]
    zeros['NU_NOTA_MT'] = 0

    # Get ID of columns to be predicted
    predictions = df[['NU_NOTA_LC']].dropna()[[]]  # .index

    # Load data
    if df_ is None:
        features, pipe = preProcess(path=file_path, train=False)
    else:
        features, pipe = preProcess(df_=df_, train=False)

    # Make predictions
    predictions['NU_NOTA_MT'] = make_predictions_stacking(
        features, models_path=models_path)
    predictions_full = pd.concat([zeros, predictions])
    if save_name is not None:
        predictions_full.to_csv(save_name)
    if return_predictions:
        return predictions_full

def bert_generator(self, batch_size, data_path, sep, y, shuffle=True):
    index_array = np.arange(y.shape[0])
    if shuffle:
        np.random.shuffle(index_array)
    _parse_data = preProcess(self._para)._parse_data
    data = _parse_data(codecs.open(data_path, 'r'), sep=sep)
    data = [[item[0] for item in sent] for sent in data]
    batches = self.make_batches(y.shape[0] - 1, batch_size)
    le = LEmbedding(self._para)
    while True:
        for batch_index, (batch_start, batch_end) in enumerate(batches):
            batch_idx = index_array[batch_start:batch_end]
            batch_data = [data[idx] for idx in batch_idx]
            batch_x = le.embedding(batch_data)
            # Drop the first token position (likely [CLS]) and keep max_len tokens.
            batch_x = batch_x[:, 1:self._para.max_len + 1]
            batch_y = y[batch_idx]
            yield batch_x, batch_y

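# `make_batches` is not defined in this snippet; it is assumed to behave like
# the Keras training utility of the same name, splitting `size` samples into
# (start, end) index pairs. A sketch under that assumption:
def make_batches_sketch(size, batch_size):
    num_batches = (size + batch_size - 1) // batch_size  # ceiling division
    return [(i * batch_size, min(size, (i + 1) * batch_size))
            for i in range(num_batches)]
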
def uploadAction(event=None):
    # python 3 import file using tkinter
    filename = filedialog.askopenfilename()
    # python 2 import file using tkinter
    # filename = tkFileDialog.askopenfilename()
    if not filename.endswith(".csv") and not filename.endswith(".xlsx"):
        lblOutput.configure(text="Wrong input file")
    else:
        # Read input file and process
        if filename.endswith(".xlsx"):
            data = pd.read_excel(filename)
        else:
            data = pd.read_csv(filename, encoding="ISO-8859-1")
        cleaned_data = preProcess(data)
        cleaned_data = featureExtraction(cleaned_data)
        result = predictInput(cleaned_data)
        result.insert(loc=0, column='tweet', value=data['tweet'])
        lblOutput.configure(text=str(result))

def Poly():
    X, y = preprocess.preProcess()
    y = np.reshape(y, (X.shape[0], 1))
    alpha = 0  # alpha=0 disables the Ridge penalty (plain least squares)
    deg = X.shape[1]  # polynomial degree tied to the feature count
    model = Ridge(alpha=alpha, solver='auto', random_state=42)
    model = Pipeline([
        ("poly_features", PolynomialFeatures(degree=deg, include_bias=True)),
        ("std_scaler", StandardScaler()),
        ("regul_reg", model),
    ])
    model.fit(X, y)
    # y_pred = model.predict(x_test)
    cross_valid = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=5)
    print('Cross Validation Errors:', -np.mean(cross_valid))
    print('Theta: \n', model.named_steps["regul_reg"].coef_)

def get_class():
    data = request.get_json()
    data = preProcess(data['data'])
    svm, nb, knn, lr, rf = predict_label(data)
    k, l, m, n, f = predictD2Vclass(data)
    a, b, c, d, e = predictclass(data)
    # Sum the binary votes from all classifiers.
    positive = (int(svm) + int(nb) + int(knn) + int(lr) + int(rf)
                + int(k) + int(l) + int(m) + int(n)
                + int(a) + int(b) + int(c) + int(d) + int(e))
    predicted = model_fasttext.predict(data)
    if predicted[0][0] == '__label__1':
        fasttextlabel = '1'
    else:
        fasttextlabel = '0'
    fasttextconfidence = predicted[1][0] * 100
    return jsonify({
        "case": data,
        "SVM": svm,
        "Naive Bayes": nb,
        "k Nearest Neighbour": knn,
        "Logistic Regression": lr,
        "Random Forest": rf,
        "D2VSVM": k,
        "D2VLR": l,
        "D2VRf": m,
        "D2VKNN": n,
        "BOWRF": e,
        "BOWSVM": a,
        "BOWNB": b,
        "BOWKNN": c,
        "BOWLR": d,
        "filenames": f,
        "cases": positive,
        "fasttextlabel": fasttextlabel,
        "fasttextconfidence": fasttextconfidence
    })

from preprocess import preProcess
import os
import sys
from ann_lib import *

inputFile = sys.argv[1]
preProcess(inputFile)

imglist = []
for img in os.listdir("./tmp"):
    imglist.append(img)
imglist.sort()

# Call the function to train a model and validate;
# this call returns the plate number as a string.
ann_call_fn(imglist)

for fileName in os.listdir("./tmp"):
    os.remove("./tmp/" + fileName)

# Randomly choose an item from a list
def randomChoice(self, l):
    return l[random.randint(0, len(l) - 1)]

# Randomly choose an instance of (category, word)
def randomTrainPair(self):
    category = self.randomChoice(self.all_categories)
    word = self.randomChoice(self.category_words[category])
    return category, word

def randomTrainExample(self):
    category, word = self.randomTrainPair()
    # print(category)
    # print(word)
    category_tensor = self.categoryTensor(category)
    word_tensor = self.inputTensor(word)
    target_tensor = self.targetTensor(word)
    return category_tensor, word_tensor, target_tensor


if __name__ == '__main__':
    from preprocess import preProcess
    path = r'D:\Pycharm\workspcae\NLP-playing\data_2\names\*.txt'
    testing = preProcess(path)
    category_words, all_categories = testing.process()
    # print(category_words['Chinese'])
    utils = Utils(category_words, all_categories)
    category_tensor, word_tensor, target_tensor = utils.randomTrainExample()
    print(category_tensor)
    print(word_tensor)
    print(target_tensor)

book = open_workbook('hinglish.xlsx')
sheet = book.sheet_by_index(0)
ctr = 0
for row in sheet.col(1):
    if ctr != 0:
        posts.append(row.value.encode('utf-8'))
    ctr += 1
print posts[1]

#### Write posts with polarity
writeDoc = open('OUTPUT.csv', 'w+')

#### STEP 1 - Apply preprocessing
for i in range(len(posts)):
    posts[i] = preprocess.preProcess(posts[i])

#### STEP 2 - Actual work
sno = 1
for post in posts:
    ## Get multipliers
    totalPol = 0.0
    tagdata = tagger.getTag(post)
    ## List of multiplying factors
    MFlist = MF.getMF(tagdata)
    for word in post.split(' '):
        ## Get tag info

import numpy as np
import preprocess
import cv2
import os

"""
POS_IMG_SITE = './img/pos/'
NEG_IMG_SITE = './img/neg/'
"""
TEST_IMG_SITE = './img/test/'
POS_IMG_SITE = './work/pos/'
NEG_IMG_SITE = './work/neg/'

# Run preprocessing once if the working directory has not been created yet.
if not os.path.exists('./work'):
    preprocess.preProcess()


def load():
    imgs = []
    tags = []
    for name in os.listdir(POS_IMG_SITE):
        imgs.append(cv2.imread(POS_IMG_SITE + name, 0))
        tags.append(1)
    for name in os.listdir(NEG_IMG_SITE):
        imgs.append(cv2.imread(NEG_IMG_SITE + name, 0))
        tags.append(0)
    return np.asarray(imgs, dtype=float), tags


def load_test():
    imgs = []

def test_preprocess():
    out = preprocess.preProcess(train=False)
    assert len(out) == 2
    assert isinstance(out[0], np.ndarray)

# Takes a tweet and returns a feature vector (combination of words)
def buildTestVector(tweet):
    return preProcess(tweet)

def mainFunction(fileName):
    preProcess(fileName)
    return getMarksString()

import numpy as np
import preprocess
import NeuralNetworks
import NeuralNetwork2

X, Y = preprocess.preProcess('Teleplay.csv')
Y = np.reshape(Y, (Y.shape[0], 1))
X_pred = preprocess.preProcess('New_Teleplay.csv')
X_pred.resize(len(X_pred), 89)

# nn = NeuralNetworks.NeuralNetwork(X, Y)
# nn.train(epochs=1000)
prediction = np.array(NeuralNetwork2.neural(X, Y, X_pred))
# round() returns a new array, so the result must be reassigned.
prediction = prediction.round(decimals=2)
# nn.pred(X_pred)
np.savetxt("18086809D_Task1.csv", prediction, delimiter=",")

book = xlrd.open_workbook(sys.argv[1])
sheet = book.sheet_by_index(0)
ctr = 0
for row in sheet.col(1):
    if ctr != 0:
        posts.append(row.value.encode('utf-8'))
    ctr += 1

#### Write posts with polarity
writeDoc = open('OUTPUT.csv', 'w+')

#### STEP 1 - Apply preprocessing
for i in range(len(posts)):
    posts[i] = preprocess.preProcess(posts[i])

#### STEP 2 - Actual work
sno = 1
for post in posts:
    ## Get multipliers
    totalPol = 0.0
    tagdata = tagger.getTag(post)
    ## List of multiplying factors
    MFlist = MF.getMF(tagdata)
    for word in post.split(' '):
        ## Get tag info

def get_cleaned_data():
    data = request.get_json()
    final = preProcess(data['data'])
    return jsonify({'data': final})

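# A hypothetical client-side call for the endpoint above, assuming the Flask
# app serves it at /get_cleaned_data on localhost:5000 (route and port are
# assumptions, not shown in the snippet). It illustrates the JSON contract:
# send {'data': <raw text>}, receive {'data': <cleaned text>}.
import requests

resp = requests.post('http://localhost:5000/get_cleaned_data',
                     json={'data': 'Some raw text to clean'})
print(resp.json()['data'])
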
    loss = 0
    for i in range(word_tensor.size(0)):
        output, hidden = self.model(category_tensor, word_tensor[i], hidden)
        loss += self.criterion(output, target_tensor[i])
    loss.backward()
    # Manual SGD step: move each parameter by -lr * gradient.
    for p in self.model.parameters():
        p.data.add_(p.grad.data, alpha=-self.lr)
    return output, loss.item() / word_tensor.size(0)


if __name__ == '__main__':
    path = r'D:\Pycharm\workspcae\NLP-playing\data_2\names\*.txt'
    pp = preProcess(path)
    category_words, all_categories = pp.process()
    # print(category_words['Chinese'])
    utils = Utils(category_words, all_categories)
    # category_tensor, word_tensor, target_tensor = utils.randomTrainExample()
    all_letters = string.ascii_letters + " .,;'-"
    input_size = len(all_letters) + 1
    hidden_size = 128
    model = RNN(len(all_categories), input_size, hidden_size, input_size)
    train = Train(model)
    n_iters = 100000
    print_every = 5000