def start_predict(self):
    # get predicting parameters
    model_name = str(self.ui.combobox_model.currentText())
    tweets_file = self.ui.textbox_tweets_file.text()
    header_included = self.ui.checkbox_header.isChecked()
    random_tweets = self.ui.spinbox_rand_tweets.value()

    # check for file validity
    try:
        Utils.file_validation(tweets_file, 'Tweet')
    except Exception as ex:
        Utils.show_msg(text=ex.args[0], title="Input Error")
        return

    # reset values of widgets
    self.reset_form()
    # disable all widgets
    self.disable_widgets(True)

    self.model_callback = CallBackMultiPredictNNet(
        self.update_batch_progress, self.update_tweet_progress,
        random_tweets)
    self.predictor = MultiPredictor(model_name, self.model_callback,
                                    tweets_file, header_included,
                                    random_tweets)

    # create a thread for predictor
    self.pred_thread = ModelPredictorThread(self.predictor)
    self.pred_thread.finished.connect(self.on_predict_finished)
    self.pred_thread.start()
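# The thread classes are not shown in this excerpt. A minimal sketch of what
# ModelPredictorThread could look like, assuming PyQt5: run() drives the
# predictor off the UI thread, and QThread's built-in `finished` signal
# (connected above) fires once run() returns. The error handling here is an
# assumption made to match the is_success()/error usage elsewhere.
from PyQt5.QtCore import QThread

class ModelPredictorThread(QThread):
    def __init__(self, predictor):
        super().__init__()
        self.predictor = predictor
        self.error = None

    def run(self):
        try:
            self.predictor.predict()
        except Exception as ex:  # surface the failure to the UI thread
            self.error = ex.args[0] if ex.args else str(ex)

    def is_success(self):
        return self.error is None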
def _perform_pre_processing(self, bot_tweet, doc_tweet, length_valid=3):
    # clean both tweets, then check that each one still has at least
    # `length_valid` tokens after preprocessing
    bot_tweet = Utils.preprocess_tweet(bot_tweet)
    doc_tweet = Utils.preprocess_tweet(doc_tweet)
    is_valid = (len(bot_tweet) >= length_valid
                and len(doc_tweet) >= length_valid)
    return bot_tweet, doc_tweet, is_valid
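# Utils.preprocess_tweet is not shown here. A hypothetical sketch of a
# typical tweet cleaner, assuming it returns a token list (which would make
# the `len(...) >= length_valid` check above count words, not characters):
import re

def preprocess_tweet(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r'https?://\S+', '', tweet)  # drop URLs
    tweet = re.sub(r'[@#]\w+', '', tweet)       # drop mentions/hashtags
    return re.findall(r"[a-z']+", tweet)        # tokenize to plain words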
def export_finished(self):
    if self.export_thread.is_success():
        Utils.show_msg(text="Exporting Complete!",
                       title="Successful",
                       msg_type=QMessageBox.Information)
        # open the exported file (os.startfile is Windows-only)
        os.startfile(self.export_thread.excel_path)
    else:
        Utils.show_msg(text=self.export_thread.error,
                       title="Error",
                       msg_type=QMessageBox.Critical)
def on_train_finished(self):
    if self.model_thread.is_success():
        if self.stop_requested:
            self.log.enable_log()
            self.log.write_log('Stopped Process Done Successfully!')
        else:
            self.ui.btn_save.setDisabled(False)
            self.log.write_log('Training Process Completed Successfully!')
    else:
        Utils.show_msg(text=self.model_thread.error, title="Error")
    # re-enable the widgets whether training succeeded or failed
    self.change_widgets_disabled(False)
def on_predict_finished(self):
    self.disable_widgets(False)
    if not self.need_stop:
        if self.pred_thread.is_success():
            self.classify_tweets()
        else:
            Utils.show_msg(text=self.pred_thread.error, title="Error")
            self.ui.btn_save.setDisabled(True)
            self.ui.groupbox_threshold.setDisabled(True)
    else:
        self.reset_form()
        self.ui.btn_save.setDisabled(True)
        self.ui.groupbox_threshold.setDisabled(True)
def start_predict(self):
    # get predicting parameters
    model_name = str(self.ui.combobox_model.currentText())
    bot_file = self.ui.textbox_bot_file.text()
    human_file = self.ui.textbox_human_file.text()
    bot_tweets = self.ui.spinbox_bot_tweets.value()
    human_tweets = self.ui.spinbox_human_tweets.value()
    total_tweets = bot_tweets + human_tweets

    # check for file validity
    try:
        Utils.file_validation(bot_file, 'Tweet')
        Utils.file_validation(human_file, 'Tweet')
    except Exception as ex:
        Utils.show_msg(text=ex.args[0], title="Input Error")
        return

    # reset values of widgets
    self.reset_form()
    # disable all widgets
    self.disable_widgets(True)

    self.model_callback = CallBackMultiPredictNNet(
        self.update_batch_progress, self.update_tweet_progress,
        total_tweets)
    self.predictor = ModelTestPredictor(model_name, self.model_callback,
                                        bot_file, human_file,
                                        bot_tweets, human_tweets)

    # create a thread for predictor
    self.pred_thread = ModelPredictorThread(self.predictor)
    self.pred_thread.finished.connect(self.on_predict_finished)
    self.pred_thread.start()
def on_predict_finished(self):
    if self.pred_thread.is_success():
        bot_sim_score = self.predictor.get_similarity_score()
        bot_percentage = int(round(bot_sim_score * 100))
        human_percentage = 100 - bot_percentage

        # highlight the group box with the higher score
        scores_arr = [bot_percentage, human_percentage]
        group_boxes_arr = [self.ui.groupbox_bot, self.ui.groupbox_human]
        max_element = np.argmax(scores_arr)
        min_element = 1 - max_element

        self.update_ui_scores(bot_percentage, human_percentage)
        group_boxes_arr[max_element].setDisabled(False)
        group_boxes_arr[min_element].setDisabled(True)
    else:
        Utils.show_msg(text=self.pred_thread.error, title="Error")

    self.ui.btn_start.setDisabled(False)
    self.ui.textbox_tweet.setReadOnly(False)
    self.ui.combobox_model.setDisabled(False)
def predict(self):
    if self.tweet_pred is None:
        raise Exception(
            'Can not Start Predicting without any Prediction Tweet!')

    # perform pre-processing
    clean_tweet_pred = Utils.preprocess_tweet(self.tweet_pred)

    # build the doc list by duplicating the prediction tweet
    # for each line in the bot list
    tweet_pred_list = [clean_tweet_pred] * len(self.bot_list)

    # convert the predicted tweet to a sequence
    temp_pred_list = [clean_tweet_pred]
    x_temp_pred_list = Utils.convert_text_to_sequences(
        self.tokenizer, temp_pred_list, self.max_text_len)

    # duplicate the sequence to the length of the bot list
    x_doc_list = [x_temp_pred_list[0]] * len(self.bot_list)
    x_doc_list = np.array(x_doc_list)

    # calculate the word-overlap additional feature
    if self.additional_feats_enabled:
        additional_feat = Utils.compute_overlap_features(
            self.bot_list, tweet_pred_list)
    else:
        additional_feat = np.zeros(len(self.bot_list))

    # perform the prediction operation
    predict_list = self.model.predict(
        [self.x_bot_list, x_doc_list, additional_feat],
        verbose=1,
        callbacks=[self.callback_predict])

    # save the fraction of bot tweets the model judged similar
    # (score > 0.5) as the tweet's bot-similarity score
    self.bot_similarity_score = len(
        list(filter(lambda x: x > 0.5, predict_list))) / len(predict_list)
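# Toy illustration of the bot-similarity score computed above: with one
# sigmoid output per (bot tweet, predicted tweet) pair, the score is the
# fraction of pairs the model judged similar (output > 0.5).
predict_list = [0.9, 0.2, 0.7, 0.4]
bot_similarity_score = len(
    list(filter(lambda x: x > 0.5, predict_list))) / len(predict_list)
assert bot_similarity_score == 0.5  # 2 of the 4 bot pairs matched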
def _generate_additional_feats(self, query_list, doc_list,
                               additional_feats_enabled):
    if additional_feats_enabled:
        self.logger.write_log(
            'Building additional features between queries and documents')
        self.overlap_feats = Utils.compute_overlap_features(
            query_list, doc_list)
    else:
        self.logger.write_log(
            'Additional features disabled - building not needed')
        self.overlap_feats = np.zeros(len(query_list))

    # determine the additional-feature length (the features may be 2-D:
    # one row per pair, one column per feature)
    if self.overlap_feats.ndim > 1:
        self.addit_feat_len = self.overlap_feats.shape[1]
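# Utils.compute_overlap_features is not shown. A hypothetical sketch of a
# word-overlap feature: for each (query, doc) pair, the fraction of query
# words that also appear in the document. The real implementation may return
# several columns per pair, which is why the code above inspects
# overlap_feats.ndim and shape[1].
import numpy as np

def compute_overlap_features(query_list, doc_list):
    feats = []
    for query, doc in zip(query_list, doc_list):
        q_words, d_words = set(query.split()), set(doc.split())
        feats.append(len(q_words & d_words) / max(len(q_words), 1))
    return np.array(feats)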
def _build_users_dict(self, query_file):
    self.logger.write_log('Building dictionary of each user and its tweets')

    curr_user = None
    users_dict = {}  # dictionary of lists
    for line in query_file:
        user = Utils.get_user_from_tweet(line)
        if user is not None:  # found a tweet with a username
            curr_user = user
        if curr_user is not None:
            if curr_user not in users_dict:
                # it's the first tweet of this user, add it
                users_dict[curr_user] = [line]
            else:
                users_dict[curr_user].append(line)
            # elif line not in users_dict[curr_user]:
            #     # append to an existing user only if the tweet
            #     # is not already present
            #     users_dict[curr_user].append(line)
    return users_dict
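# Toy illustration of _build_users_dict, assuming a file format where a line
# carrying a username opens a new group and the following bare lines belong
# to that user (get_user_from_tweet below is a hypothetical stand-in for
# Utils.get_user_from_tweet):
import re

def get_user_from_tweet(line):
    match = re.match(r'@(\w+):', line)
    return match.group(1) if match else None

query_file = ['@bot_a: cheap followers, click here',
              'limited offer, act now',
              '@bot_b: win a free phone']
# -> {'bot_a': ['@bot_a: cheap followers, click here',
#               'limited offer, act now'],
#     'bot_b': ['@bot_b: win a free phone']}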
def start_train(self):
    # get training parameters
    embedding_file = self.ui.textbox_embed.text()
    bot_file = self.ui.textbox_bot.text()
    human_file = self.ui.textbox_human.text()
    train_split = self.ui.slider_train.value() / 100.0
    test_split = 1 - train_split
    val_split = self.ui.slider_val.value() / 100.0
    epochs = self.ui.spinbox_epoches.value()
    batch_size = self.ui.spinbox_batch.value()
    addit_feat_enabled = self.ui.checkbox_additional_feats.isChecked()
    early_stop = self.ui.spinbox_earlystop.value()

    # get dataset config from combobox
    gen_method = str(self.ui.combobox_gen_method.currentText())
    if gen_method == "User Grouping":
        dataset_config = DatasetConfig.USER_STATE
    else:  # "Random Pairing"
        dataset_config = DatasetConfig.RANDOM_STATE

    # check for early stop validity
    if early_stop > epochs:
        Utils.show_msg(
            text="Early Stopping Epochs Can Not Be Bigger\n"
                 "Than the Number of Training Epochs!",
            title="Input Error")
        return

    # check for files validity
    try:
        Utils.file_validation(embedding_file, 'Embedding')
        Utils.file_validation(bot_file, 'Bot')
        Utils.file_validation(human_file, 'Human')
    except Exception as ex:
        Utils.show_msg(text=ex.args[0], title="Input Error")
        return

    # reset progress bars
    self.ui.progressbar_epoches.setValue(0)
    self.ui.progressbar_batch.setValue(0)
    # reset graphs
    self.reset_graphs()

    # disable unnecessary widgets while training
    self.change_widgets_disabled(True)
    self.ui.btn_save.setDisabled(True)

    self.log.write_log("Start pre-training phase...")

    # create model instance with all parameters
    self.custom_callback = CallBackTrainNNet(self.log, self.draw_graphs,
                                             self.batch_graphs_clear,
                                             self.update_progressbars,
                                             self.get_status_stopped,
                                             self.MAX_BATCH,
                                             self.update_batch_range)
    self.model = ModelTrainer(logger=self.log,
                              embedding_file=embedding_file,
                              bots_file=bot_file,
                              human_file=human_file,
                              validation_split=val_split,
                              test_split=test_split,
                              batch_size=batch_size,
                              epochs=epochs,
                              additional_feats_enabled=addit_feat_enabled,
                              early_stopping=early_stop,
                              dataset_config=dataset_config,
                              custom_callback=self.custom_callback)

    # create a thread for the training phase and start it
    self.model_thread = ModelTrainerThread(self.model)
    self.model_thread.finished.connect(self.on_train_finished)
    self.model_thread.start()
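# DatasetConfig is not defined in this excerpt. A hypothetical sketch of the
# enum matching the two combobox options handled above:
from enum import Enum

class DatasetConfig(Enum):
    USER_STATE = 0    # "User Grouping": pair tweets grouped by user
    RANDOM_STATE = 1  # "Random Pairing": pair tweets at random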
def train_model(self):
    # build the dataset for training
    # self._load_dataset()  # (optional: load an existing dataset instead)
    self.dataset.perform_build(self.bots_file, self.human_file,
                               self.additional_feats_enabled)

    self.logger.write_log('Splitting datasets into train and test sets')
    data_train, data_test = self._split_train_test_sets()
    q_train, d_train, addn_feat_train, y_train = data_train
    q_test, d_test, addn_feat_test, y_test = data_test
    self.logger.write_log(f'train samples: {len(q_train)}')
    self.logger.write_log(f'test samples: {len(q_test)}')

    # extract the parameters our model needs
    vocabulary = self.dataset.tokenizer.index_word
    max_text_len = self.dataset.max_text_len
    addit_feat_len = self.dataset.addit_feat_len
    tokenizer = self.dataset.tokenizer

    # convert texts to sequences
    self.logger.write_log('Converting texts to sequences')
    x_q_train = Utils.convert_text_to_sequences(tokenizer, q_train,
                                                max_text_len)
    x_d_train = Utils.convert_text_to_sequences(tokenizer, d_train,
                                                max_text_len)
    x_q_test = Utils.convert_text_to_sequences(tokenizer, q_test,
                                               max_text_len)
    x_d_test = Utils.convert_text_to_sequences(tokenizer, d_test,
                                               max_text_len)

    # prepare data for predicting
    self.bot_tweets = self._get_unique_matches(q_train, y_train)
    self.x_bot_tweets = Utils.convert_text_to_sequences(
        tokenizer, self.bot_tweets, max_text_len)
    self.bot_test_tweets = q_test
    self.doc_test_tweets = d_test
    self.labels_test = y_test

    # create our model with the embedding matrix
    self.model = self._create_model(vocabulary, max_text_len,
                                    addit_feat_len)

    self.logger.write_log('Starting training process...')
    # start fitting the model
    history = self.model.fit(
        [np.array(x_q_train),
         np.array(x_d_train),
         np.array(addn_feat_train)],
        np.array(y_train),
        epochs=self.epochs,
        batch_size=self.batch_size,
        verbose=1,
        validation_split=self.validation_split,
        callbacks=self._get_callbacks())
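# Utils.convert_text_to_sequences is not shown. A minimal sketch, assuming
# the dataset's tokenizer is a Keras Tokenizer: map each text to integer
# word ids and pad/truncate every sequence to max_text_len so the arrays
# fed to model.fit above have a fixed shape.
from tensorflow.keras.preprocessing.sequence import pad_sequences

def convert_text_to_sequences(tokenizer, texts, max_text_len):
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=max_text_len)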
def _get_unique_matches(self, query_train, label_train):
    # keep only the queries whose label marks a bot match (label == 1),
    # then drop duplicate tweets
    lst = list(zip(query_train, label_train))
    lst_matches = list(filter(lambda x: x[1] == 1, lst))
    bot_matches, _ = list(map(list, zip(*lst_matches)))
    unique_matches = Utils.remove_duplicates(bot_matches)
    return unique_matches
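# Utils.remove_duplicates is not shown. A hypothetical order-preserving
# version (a plain set() would lose the original tweet order):
def remove_duplicates(items):
    seen = set()
    return [x for x in items if not (x in seen or seen.add(x))]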