def basic_filter(self, question):
    """Classify a JSON-encoded question as spam or ham.

    A question is reported as spam when ``self.is_repeat`` flags it as a
    repeat or when its cleaned body is empty. Otherwise the stored
    classifier's probabilities are adjusted by two heuristics (a penalty
    for short questions and a penalty for abusive words) and the larger
    of the two probabilities decides the result.

    Args:
        question: JSON string describing the question. After
            ``self.clean_data`` the result must expose a ``'body'`` entry.

    Returns:
        A JSON string: ``{"status": 0}`` for spam, ``{"status": 1}`` for
        ham. Ham is also returned (fail-open) when no database connection
        is available or when any exception is raised while scoring.
    """
    question = json.loads(question)

    # Probability columns of the learner: index 0 = spam, index 1 = ham.
    clf = joblib.load(os.path.join(BASE_DIR, 'spam_filter', 'learner.pkl'))

    connection = local_connection('spam_filter')
    if not connection:
        # Same fail-open answer as the exception path below, so callers
        # always receive a JSON string instead of an implicit None.
        self.logger.error('No database connection for spam filter')
        return json.dumps({'status': 1})

    try:
        with connection.cursor() as cursor:
            if self.is_repeat(cursor, question):
                return json.dumps({'status': 0})

            qs = self.clean_data(question)
            if qs['body'] == '':
                return json.dumps({'status': 0})

            prediction = clf.predict_proba([qs['body']])
            spam_probability = prediction[0][0]
            ham_probability = prediction[0][1]
            question_words = qs['body'].split(' ')

            # Questions shorter than MINIMUM_NUMBER_OF_WORDS are pushed
            # towards spam by WORD_PER_PROBABILITY per missing word.
            weight = WORD_PER_PROBABILITY * MINIMUM_NUMBER_OF_WORDS - \
                (len(question_words) * WORD_PER_PROBABILITY)
            if spam_probability <= 0.5 and weight > 0:
                spam_probability += weight
                ham_probability -= weight

            # Each abusive word found adds a fixed penalty; stop as soon
            # as a hit is seen with the spam probability already past 0.5.
            unique_words = set(question_words)
            for abusive_word in abusive_words + abusive_words_bangla:
                if abusive_word not in unique_words:
                    continue
                if spam_probability > 0.5:
                    break
                spam_probability += ABUSIVE_WORDS_PROBABILITY
                ham_probability -= ABUSIVE_WORDS_PROBABILITY

            if spam_probability > ham_probability:
                return json.dumps({'status': 0})
            return json.dumps({'status': 1})
    except Exception:
        # Deliberate best effort: a scoring failure must not block the
        # question, so it is logged and treated as ham.
        self.logger.error('Exception in getting probability',
                          exc_info=True)
        return json.dumps({'status': 1})
    finally:
        connection.close()
def match_question(connection_cursor, id, body):
    """Look for an earlier question with exactly the same body.

    Args:
        connection_cursor: Open DB-API cursor used to run the query.
        id: Id of the current question; only rows with a smaller id are
            considered.
        body: Question text to match exactly.

    Returns:
        Whatever ``connection_cursor.fetchone()`` yields for the first
        matching row, or its "no row" value when nothing matches.
    """
    # Values are bound as parameters instead of being concatenated into
    # the statement, so quotes in ``body`` can neither break the query nor
    # inject SQL.
    # NOTE(review): assumes the driver uses the "format" paramstyle (%s),
    # as PyMySQL / psycopg2 do -- confirm against local_connection().
    match_sql = "SELECT id FROM questions WHERE id < %s AND body = %s"
    connection_cursor.execute(match_sql, (id, body))
    return connection_cursor.fetchone()


def insert_spam_ham(connection_instance, connection_cursor, id, body, status):
    """Store a classified question in ``filtered_questions`` and commit.

    Args:
        connection_instance: Connection on which ``commit()`` is called.
        connection_cursor: Open DB-API cursor used to run the insert.
        id: Id of the source question (stored as ``questions_id``).
        body: Question text.
        status: Classification label to store.
    """
    # Parameterized for the same reason as in match_question: the body is
    # free text and must never be spliced into the SQL string.
    # NOTE(review): assumes the "format" paramstyle (%s) -- confirm.
    insert_sql = (
        "INSERT INTO filtered_questions(questions_id, body, status) "
        "VALUES (%s, %s, %s)"
    )
    connection_cursor.execute(insert_sql, (id, body, status))
    connection_instance.commit()


# NOTE(review): the script entry point below continues past the reviewed excerpt; its text is left exactly as found.
if __name__ == '__main__': connection = local_connection('spam_filter') # decoder instance decoder = Decoder() if connection: try: with connection.cursor() as cursor: clean_questions(cursor, connection, decoder) data = select_questions(cursor) for i in data: is_matched = match_question(cursor, str(i['id']), i['body']) if not is_matched: if i['status'] != 'spam': i['status'] = 'ham'