Example #1
    def predict(self):
        if self.tweet_pred is None:
            raise ValueError(
                'Cannot start predicting without a prediction tweet!')

        # perform pre-processing
        clean_tweet_pred = Utils.preprocess_tweet(self.tweet_pred)

        # build the doc list by duplicating the prediction tweet once per entry in the bot list
        tweet_pred_list = [clean_tweet_pred] * len(self.bot_list)

        # convert the prediction tweet to a padded sequence
        temp_pred_list = [clean_tweet_pred]
        x_temp_pred_list = Utils.convert_text_to_sequences(
            self.tokenizer, temp_pred_list, self.max_text_len)

        # duplicate the sequence to match the length of the bot list
        x_doc_list = [x_temp_pred_list[0]] * len(self.bot_list)
        x_doc_list = np.array(x_doc_list)

        # compute the word-overlap additional feature
        if self.additional_feats_enabled:
            additional_feat = Utils.compute_overlap_features(
                self.bot_list, tweet_pred_list)
        else:
            additional_feat = np.zeros(len(self.bot_list))

        # perform the prediction operation
        predict_list = self.model.predict(
            [self.x_bot_list, x_doc_list, additional_feat],
            verbose=1,
            callbacks=[self.callback_predict])

        # compute and store how similar the current tweet is to the
        # training bot list: the fraction of bots scored above 0.5
        self.bot_similarity_score = float(np.mean(predict_list > 0.5))
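The `Utils` helpers called in `predict` are not shown in this example. Below is a minimal sketch of what they might look like, assuming a Keras `Tokenizer` with `pad_sequences` for the sequence conversion and a simple token-overlap ratio for the extra feature; every name and signature here is inferred from the call sites above, not confirmed by the source.

import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

class Utils:
    @staticmethod
    def preprocess_tweet(tweet):
        # Hypothetical cleanup: lowercase and trim; the real helper
        # likely also strips URLs, mentions, and punctuation.
        return tweet.lower().strip()

    @staticmethod
    def convert_text_to_sequences(tokenizer, texts, max_text_len):
        # Map each text to token ids, then pad to a fixed length.
        sequences = tokenizer.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=max_text_len)

    @staticmethod
    def compute_overlap_features(queries, docs):
        # One scalar per (query, doc) pair: the fraction of query
        # tokens that also appear in the doc (a guess at what
        # "word overlapping" means here).
        feats = []
        for q, d in zip(queries, docs):
            q_tokens, d_tokens = set(q.split()), set(d.split())
            feats.append(len(q_tokens & d_tokens) / max(len(q_tokens), 1))
        return np.array(feats)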
Example #2
    def train_model(self):
        # load the existing dataset, or create a new one if it does not exist
        # self._load_dataset()

        # build dataset for training
        self.dataset.perform_build(self.bots_file, self.human_file,
                                   self.additional_feats_enabled)

        self.logger.write_log('Splitting datasets into train and test sets')

        data_train, data_test = self._split_train_test_sets()
        q_train, d_train, addn_feat_train, y_train = data_train
        q_test, d_test, addn_feat_test, y_test = data_test

        self.logger.write_log(f'train samples: {len(q_train)}')
        self.logger.write_log(f'test samples: {len(q_test)}')

        # extract the parameters used by our model
        vocabulary = self.dataset.tokenizer.index_word
        max_text_len = self.dataset.max_text_len
        addit_feat_len = self.dataset.addit_feat_len
        tokenizer = self.dataset.tokenizer

        # convert texts to padded sequences
        self.logger.write_log('Converting texts to sequences')
        x_q_train = Utils.convert_text_to_sequences(tokenizer, q_train,
                                                    max_text_len)
        x_d_train = Utils.convert_text_to_sequences(tokenizer, d_train,
                                                    max_text_len)
        x_q_test = Utils.convert_text_to_sequences(tokenizer, q_test,
                                                   max_text_len)
        x_d_test = Utils.convert_text_to_sequences(tokenizer, d_test,
                                                   max_text_len)

        # prepare data for prediction
        self.bot_tweets = self._get_unique_matches(q_train, y_train)
        self.x_bot_tweets = Utils.convert_text_to_sequences(
            tokenizer, self.bot_tweets, max_text_len)

        self.bot_test_tweets = q_test
        self.doc_test_tweets = d_test
        self.labels_test = y_test

        # create our model with embedding matrix
        self.model = self._create_model(vocabulary, max_text_len,
                                        addit_feat_len)

        self.logger.write_log('Starting training process...')

        # start fitting the model
        history = self.model.fit(
            [
                np.array(x_q_train),
                np.array(x_d_train),
                np.array(addn_feat_train)
            ],
            np.array(y_train),
            epochs=self.epochs,
            batch_size=self.batch_size,
            verbose=1,
            validation_split=self.validation_split,
            callbacks=self._get_callbacks())
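The private helpers `_split_train_test_sets`, `_get_unique_matches`, `_create_model`, and `_get_callbacks` are not part of this example. As one illustration, here is a minimal sketch of `_get_callbacks` using standard Keras callbacks; which callbacks the original project actually wires in, and the checkpoint path, are assumptions. It is shown unindented for readability but would live as a method on the same class as `train_model`.

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

def _get_callbacks(self):
    # Assumed configuration: stop once validation loss stops
    # improving, and keep the best weights seen so far on disk.
    return [
        EarlyStopping(monitor='val_loss', patience=3,
                      restore_best_weights=True),
        ModelCheckpoint('model_best.h5', monitor='val_loss',
                        save_best_only=True),
    ]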