def predict(self):
    """Score how similar the current prediction tweet is to the trained bot tweets.

    Preprocesses ``self.tweet_pred``, converts it to a padded token sequence,
    duplicates that sequence once per bot tweet, and runs the model pairwise
    against ``self.x_bot_list``.  The fraction of pairs the model scores above
    0.5 is stored in ``self.bot_similarity_score``.

    Raises:
        ValueError: if no prediction tweet has been set on the instance.
    """
    if self.tweet_pred is None:
        # ValueError is more precise than a bare Exception and is still
        # caught by any caller that catches Exception.
        raise ValueError(
            'Can not Start Predicting without any Prediction Tweet!')

    # perform pre-processing on the raw tweet text
    clean_tweet_pred = Utils.preprocess_tweet(self.tweet_pred)

    # duplicate the (cleaned) prediction tweet once per bot-list entry,
    # so each bot tweet is paired with the same document
    tweet_pred_list = [clean_tweet_pred] * len(self.bot_list)

    # convert the predicted tweet to a padded sequence ONCE, then replicate
    # the single sequence to the length of the bot list
    x_temp_pred_list = Utils.convert_text_to_sequences(
        self.tokenizer, [clean_tweet_pred], self.max_text_len)
    x_doc_list = np.array([x_temp_pred_list[0]] * len(self.bot_list))

    # word-overlap additional feature; zeros when the feature is disabled
    # (model still expects an input of this length)
    if self.additional_feats_enabled:
        additional_feat = Utils.compute_overlap_features(
            self.bot_list, tweet_pred_list)
    else:
        additional_feat = np.zeros(len(self.bot_list))

    # pairwise prediction over (bot tweet, predicted tweet, extra features)
    predict_list = self.model.predict(
        [self.x_bot_list, x_doc_list, additional_feat],
        verbose=1,
        callbacks=[self.callback_predict])

    # fraction of bot tweets the model scored as a match (score > 0.5)
    self.bot_similarity_score = sum(
        1 for score in predict_list if score > 0.5) / len(predict_list)
def train_model(self):
    """Build the dataset, train the similarity model, and cache prediction data.

    Builds the dataset from the bot/human files, splits it into train and
    test sets, converts texts to padded token sequences, stores the unique
    bot tweets and the raw test splits (used later by prediction), creates
    the model, and fits it with the configured training hyper-parameters.
    """
    # build dataset for training
    self.dataset.perform_build(self.bots_file, self.human_file,
                               self.additional_feats_enabled)

    self.logger.write_log('Splitting datasets into train and test sets')
    data_train, data_test = self._split_train_test_sets()
    q_train, d_train, addn_feat_train, y_train = data_train
    q_test, d_test, addn_feat_test, y_test = data_test
    self.logger.write_log(f'trains samples: {len(q_train)}')
    self.logger.write_log(f'test samples: {len(q_test)}')

    # extract parameters our model is built from
    vocabulary = self.dataset.tokenizer.index_word
    max_text_len = self.dataset.max_text_len
    addit_feat_len = self.dataset.addit_feat_len
    tokenizer = self.dataset.tokenizer

    # convert texts to padded token sequences
    self.logger.write_log('convert texts to sequences')
    x_q_train = Utils.convert_text_to_sequences(tokenizer, q_train,
                                                max_text_len)
    x_d_train = Utils.convert_text_to_sequences(tokenizer, d_train,
                                                max_text_len)
    # NOTE(review): x_q_test / x_d_test are computed but never used in this
    # method — presumably evaluation happens elsewhere; confirm before removing.
    x_q_test = Utils.convert_text_to_sequences(tokenizer, q_test,
                                               max_text_len)
    x_d_test = Utils.convert_text_to_sequences(tokenizer, d_test,
                                               max_text_len)

    # cache data needed later for predicting
    self.bot_tweets = self._get_unique_matches(q_train, y_train)
    self.x_bot_tweets = Utils.convert_text_to_sequences(
        tokenizer, self.bot_tweets, max_text_len)
    self.bot_test_tweets = q_test
    self.doc_test_tweets = d_test
    self.labels_test = y_test

    # create our model with embedding matrix
    self.model = self._create_model(vocabulary, max_text_len, addit_feat_len)

    self.logger.write_log('Start training process..')

    # fit the model on (query, doc, additional-features) input triples
    self.model.fit(
        [np.array(x_q_train),
         np.array(x_d_train),
         np.array(addn_feat_train)],
        np.array(y_train),
        epochs=self.epochs,
        batch_size=self.batch_size,
        verbose=1,
        validation_split=self.validation_split,
        callbacks=self._get_callbacks())