Пример #1
0
def test_with_unlabeled_state(data_filename, replacer,classifier, bow, tfidf):
    """Predict the texts of an unlabeled CSV file and save the predictions.

    The CSV at ``data_filename`` is read with pandas and its ``'content'``
    column is used as the input texts. The bag-of-words transformer, the
    TF-IDF transformer and the classifier are restored with ``joblib.load``
    from ``bow``, ``tfidf`` and ``classifier`` respectively. The texts are
    passed through ``preprocess`` together with ``replacer`` and the result
    of ``Worker.classifier_predict`` is written to ``'predict_data.csv'``.

    Args:
        data_filename: CSV source handed to ``pd.read_csv``; must provide a
            ``'content'`` column.
        replacer: Object forwarded unchanged to ``preprocess``.
        classifier: Argument for ``joblib.load`` yielding the classifier.
        bow: Argument for ``joblib.load`` yielding the bag-of-words
            transformer.
        tfidf: Argument for ``joblib.load`` yielding the TF-IDF transformer.
    """
    frame = pd.read_csv(data_filename)

    # Restore the persisted artefacts in the order bow -> tfidf -> classifier.
    bow_model = joblib.load(bow)
    tfidf_model = joblib.load(tfidf)
    fitted_classifier = joblib.load(classifier)

    texts = frame['content']
    predictor = Worker(bow=bow_model, tfidf=tfidf_model)
    tokens = preprocess(texts, replacer)

    predictor.classifier_predict(tokens, fitted_classifier).to_csv('predict_data.csv')
Пример #2
0
def job(csv_filename=csv_filename, bow_transformer=bow_transformer, classifier=classifier, tfidf_transformer=tfidf_transformer, host=host, passcode=passcode, charset=charset, usr=usr, db=db,time_storage=time_storage):
    """Classify the comments created since the last run and store the result.

    The steps run in this order and every outcome is reported via ``logger``:

    1. Build a ``SynonymReplacer`` and fill it from ``csv_filename``.
    2. Read the previous run time from ``time_storage``; the current local
       time is used instead when the file cannot be read.
    3. Select the ``GC_User_Comment`` rows whose ``CreatedTime`` lies in
       ``[lasttime, now)``.
    4. Load the persisted models with joblib, preprocess the comment texts
       and predict them (the predictions are also dumped to
       ``'predict_data.csv'``).
    5. Write the derived ``State`` / ``AuditState`` back and commit.
    6. Store the new run time in ``time_storage``.

    The run stops at the first failing step from 3 onwards, so nothing is
    written to the database after a failed prediction, and ``time_storage``
    is only advanced when every step succeeded. Step failures are logged
    rather than raised. The database connection is closed on every path.

    Args:
        csv_filename: Synonym CSV handed to
            ``SynonymReplacer.addSynDict_from_csv``.
        bow_transformer: Argument for ``joblib.load`` yielding the
            bag-of-words transformer.
        classifier: Argument for ``joblib.load`` yielding the classifier.
        tfidf_transformer: Argument for ``joblib.load`` yielding the TF-IDF
            transformer.
        host: Database host for ``pymysql.connect``.
        passcode: Database password for ``pymysql.connect``.
        charset: Connection charset for ``pymysql.connect``.
        usr: Database user for ``pymysql.connect``.
        db: Database name for ``pymysql.connect``.
        time_storage: Path of the text file holding the time of the last
            successful run.

    Returns:
        None.
    """
    time_format = '%Y-%m-%d %X'

    # Start to load synonym pairs to SynonymReplacer
    replacer = None
    logger.info("Start to load synonym pairs to SynonymReplacer:")
    try:
        replacer = SynonymReplacer()
        replacer.addSynDict_from_csv(csv_filename)
        logger.info("Successfully load synonym pairs to SynonymReplacer.")
    except Exception as err:
        logger.error("Fail to load synonym pairs to SynonymReplacer. {}".format(err))

    if replacer is None:
        # Preprocessing needs a replacer instance, so nothing useful can be
        # done without one. A replacer whose CSV failed to load is still
        # used as-is (best effort).
        return

    logger.info("Start to load lasttime from time_storage.txt:")
    try:
        with open(time_storage, 'r') as storage:
            # Drop the line terminator / stray whitespace so that only the
            # timestamp itself reaches the SQL parameter.
            lasttime = storage.readline().strip()
        logger.info("Successfully load lasttime from time_storage.txt.")
    except Exception as err:
        lasttime = time.strftime(time_format, time.localtime())
        logger.error("Fail to load lasttime from time_storage.txt. Instead, set current time to be lasttime. {}".format(err))

    # Connect to the database. Only the connect call is guarded here so the
    # "fail to connect" message is never emitted for later errors.
    logger.info("Start to connect to the database:")
    try:
        connection = pymysql.connect(host=host,
                                     user=usr,
                                     password=passcode,
                                     db=db,
                                     charset=charset,
                                     cursorclass=pymysql.cursors.DictCursor)
    except Exception as err:
        logger.error("Fail to connect to the database. {}".format(err))
        return
    logger.info("Successfully connect to the database.")

    try:
        with connection.cursor() as cursor:
            # Upper bound of the query window; it becomes the lower bound of
            # the next run once everything succeeded.
            newtime = str(time.strftime(time_format, time.localtime()))
            row_count = cursor.execute(
                "SELECT * FROM GC_User_Comment WHERE CreatedTime >= %s and CreatedTime < %s",
                (lasttime, newtime))
            if not row_count:
                logger.info("No new comment was created from the last recorded CreatedTime")
                return

            recent_data = pd.DataFrame(cursor.fetchall())
            recent_data = recent_data[['CMTID', 'Content', 'State', 'AuditState']]
            logger.info("Successfully retrieve {} comments from database.".format(len(recent_data)))

            logger.info("Start to load models:")
            try:
                classifier_model = joblib.load(classifier)
                bow_model = joblib.load(bow_transformer)
                tfidf_model = joblib.load(tfidf_transformer)
                logger.info("Successfully load models.")
            except Exception as err:
                logger.error("Fail to load models. {}".format(err))
                return

            logger.info("Start to preprocecss Recentdata:")
            try:
                word_list = preprocess(recent_data['Content'], replacer)
                logger.info("Successfully preprocess Recentdata.")
            except Exception as err:
                logger.error("Fail to preprocess Recentdata. {}".format(err))
                return

            logger.info("Start to predict Recentdata:")
            try:
                worker = Worker(bow=bow_model, tfidf=tfidf_model)
                predict_data = worker.classifier_predict(word_list, classifier_model)
                predict_data.to_csv('predict_data.csv')
                logger.info("Successfully predict Recentdata")
                # Both target columns are derived from the 'trash' column of
                # the prediction result.
                recent_data['State'] = [state_classify(prob) for prob in predict_data['trash']]
                recent_data['AuditState'] = [auditstate_classify(prob) for prob in predict_data['trash']]
            except Exception as err:
                logger.error("Fail to predict Recentdata. {}".format(err))
                return

            logger.info("Start to update State and AuditState in the database:")
            try:
                count = 0
                for cmt_id, _content, state, audit_state in recent_data.values:
                    cursor.execute(
                        "UPDATE GC_User_Comment SET State = %s, AuditState = %s WHERE CMTID = %s",
                        (int(state), int(audit_state), int(cmt_id)))
                    count += 1
                connection.commit()
            except Exception as err:
                logger.error("Fail to update GC_User_Comment State. {0}".format(err))
                # Discard the partially applied batch; the comments are
                # selected again next run because time_storage is untouched.
                connection.rollback()
                return

            # Update lasttime in the txt file only after every step succeeded.
            logger.info("Start to update time in time_storage.txt:")
            try:
                with open(time_storage, 'w') as storage:
                    storage.write(newtime)
                logger.info("Successfully updated time in time_storage.txt")
                logger.info("Successfully updated State and AuditState for {} comment".format(count))
            except Exception as err:
                logger.error("Fail to update time in time_storage.txt. {}".format(err))
    except Exception as err:
        # Anything unexpected after a successful connect (query failure,
        # missing columns, rollback failure, ...).
        logger.error("Fail to process comments from the database. {}".format(err))
    finally:
        # Runs on the early returns above as well, so the connection is never
        # left open.
        try:
            connection.close()
        except Exception as err:
            logger.error("Fail to close the database connection. {}".format(err))