예제 #1
0
    def basic_filter(self, question):
        """Decide whether a JSON-encoded question is spam or ham.

        The question is decoded, checked against the database for repeats,
        cleaned, and then scored with the pickled classifier.  The score is
        nudged toward the first class for short texts and for texts that
        contain a listed abusive word.

        Returns a JSON string: ``{"status": 0}`` or ``{"status": 1}``
        (per the original note, 0 = spam and 1 = ham).  Repeated questions
        and questions whose cleaned body is empty are reported as status 0.
        If no database connection is available the method returns ``None``.
        Any exception raised inside the database section is logged and
        status 1 is returned.
        """
        payload = json.loads(question)

        # Load the trained classifier from disk.
        model = joblib.load(
            os.path.join(BASE_DIR + '/spam_filter/', 'learner.pkl'))

        # Open the database connection; without one there is nothing to do.
        db = local_connection('spam_filter')
        if not db:
            return None

        try:
            with db.cursor() as cursor:
                # A question already seen is rejected outright.
                if self.is_repeat(cursor, payload):
                    return json.dumps({'status': 0})

                cleaned = self.clean_data(payload)
                text = cleaned['body']
                # Nothing left after cleaning: reject as well.
                if text == '':
                    return json.dumps({'status': 0})

                prediction = model.predict_proba([text])
                tokens = text.split(' ')

                # Positive only when the text has fewer than
                # MINIMUM_NUMBER_OF_WORDS space-separated tokens.
                short_text_boost = (
                    WORD_PER_PROBABILITY * MINIMUM_NUMBER_OF_WORDS
                    - (len(tokens) * WORD_PER_PROBABILITY))

                if prediction[0][0] <= 0.5 and short_text_boost > 0:
                    prediction[0][0] += short_text_boost
                    prediction[0][1] -= short_text_boost

                # Each abusive word present shifts the score further, until
                # the first-class probability has crossed 0.5.
                for term in abusive_words + abusive_words_bangla:
                    if term not in tokens:
                        continue
                    if prediction[0][0] <= 0.5:
                        prediction[0][0] += ABUSIVE_WORDS_PROBABILITY
                        prediction[0][1] -= ABUSIVE_WORDS_PROBABILITY
                    else:
                        break

                first_class_wins = prediction[0][0] > prediction[0][1]
                return json.dumps({'status': 0 if first_class_wins else 1})
        except Exception:
            # Fail open: a scoring/database error lets the question through.
            self.logger.error('Exception in getting probability',
                              exc_info=True)
            return json.dumps({'status': 1})
        finally:
            db.close()
예제 #2
0
def match_question(connection_cursor, id, body):
    """Find an earlier question whose body equals ``body``.

    The values are passed to the driver as bound parameters instead of
    being concatenated into the SQL text, so quotes or SQL fragments inside
    ``body`` (or ``id``) cannot alter the statement.

    Args:
        connection_cursor: DB-API cursor used to execute the query.
        id: Exclusive upper bound for ``questions.id``; callers in this
            file pass it as a string.
        body: Question text to compare against ``questions.body``.

    Returns:
        The result of ``connection_cursor.fetchone()`` for the query.
    """
    # NOTE(review): ``%s`` is the placeholder for format/pyformat drivers
    # (e.g. PyMySQL, MySQLdb, psycopg2) -- confirm against the driver that
    # ``local_connection`` returns.
    match_sql = "SELECT id FROM questions WHERE id < %s AND body = %s"
    connection_cursor.execute(match_sql, (id, body))
    return connection_cursor.fetchone()


def insert_spam_ham(connection_instance, connection_cursor, id, body, status):
    """Insert one classified question into ``filtered_questions`` and commit.

    The values are passed to the driver as bound parameters instead of
    being concatenated into the SQL text, so a body containing quotes no
    longer breaks the statement or allows SQL injection.

    Args:
        connection_instance: Connection whose ``commit()`` is called after
            the insert.
        connection_cursor: DB-API cursor used to execute the insert.
        id: Value stored in the ``questions_id`` column.
        body: Value stored in the ``body`` column.
        status: Value stored in the ``status`` column.
    """
    # NOTE(review): ``%s`` is the placeholder for format/pyformat drivers
    # (e.g. PyMySQL, MySQLdb, psycopg2) -- confirm against the driver that
    # ``local_connection`` returns.
    insert_sql = ("INSERT INTO filtered_questions(questions_id, body, status) "
                  "VALUES(%s, %s, %s)")
    connection_cursor.execute(insert_sql, (id, body, status))
    connection_instance.commit()


if __name__ == '__main__':
    connection = local_connection('spam_filter')
    # decoder instance
    decoder = Decoder()

    if connection:
        try:
            with connection.cursor() as cursor:
                clean_questions(cursor, connection, decoder)
                data = select_questions(cursor)
                for i in data:
                    is_matched = match_question(cursor, str(i['id']),
                                                i['body'])

                    if not is_matched:
                        if i['status'] != 'spam':
                            i['status'] = 'ham'