def create_category_of_topic_tag_vector(db):
    # Fetch every category_of_topic document
    category_of_topic_list = db[SJ_DB_CATEGORY].find(
        {'category_name': {'$in': list(SJ_CATEGORY_OF_TOPIC_SET)}},
        {'_id': 0})
    category_of_topic_list = list(category_of_topic_list)

    for category in category_of_topic_list:
        category_tag_vector = FastText.get_doc_vector(category['tag']).tolist()

        # Add the tag_vector field to the category_of_topic collection!
        db[SJ_DB_CATEGORY].update_one(
            {'category_name': category['category_name']},
            {'$set': {'tag_vector': category_tag_vector}})
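# Usage sketch (an assumption, not part of the job itself: the connection
# constants and the "soojle" database name mirror the measurement runners below):
# db_client = MongoClient('mongodb://%s:%s@%s' % (MONGODB_ID, MONGODB_PW, MONGODB_HOST))
# create_category_of_topic_tag_vector(db_client['soojle'])
# db_client.close()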
import simple_text_classification
import preprocessing
import FastText
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# read the train data
usa = preprocessing.train_data(23)
usa.to_csv("usa.csv", index=False)
usa['category_id'] = usa['tag'].factorize()[0]

lr_pred = simple_text_classification.cross_validation_logistic_regression(usa)
lr_pred.columns = ['label_id_lr', 'proba_lr', 'label_lr']

ft_pred = FastText.cross_validation_fasttext(usa)
ft_pred.columns = ['label_ft', 'proba_ft', 'label_id_ft']

# ensemble stacking method: the two base models' predicted label ids become
# features for a logistic-regression meta-model
features = pd.concat([lr_pred['label_id_lr'], ft_pred['label_id_ft']], axis=1)
labels = usa.category_id
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.30, random_state=0)
model = LogisticRegression(random_state=1)
model.fit(X_train, y_train)
model.score(X_test, y_test)

# pick the class with the highest probability
# (the original snippet breaks off mid-statement here; the completion below is
# an assumption based on the comment above: keep whichever base model is more
# confident for each row)
predictions = pd.concat([lr_pred, ft_pred], axis=1)
predictions['label'] = predictions.apply(
    lambda row: row['label_lr'] if row['proba_lr'] >= row['proba_ft']
    else row['label_ft'],
    axis=1)
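# Stacking usage sketch (hypothetical values; shows how the fitted meta-model
# consumes the two base models' predicted label ids for an unseen example):
# new_features = pd.DataFrame({'label_id_lr': [3], 'label_id_ft': [3]})
# print(model.predict(new_features))        # meta-model's final class
# print(model.predict_proba(new_features))  # its confidence per class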
def SJ_interest_measurement_run():
    db_client = MongoClient('mongodb://%s:%s@%s' % (MONGODB_ID, MONGODB_PW, MONGODB_HOST))
    db = db_client["soojle"]

    renewal_time = find_variable(db, 'renewal')

    USER_list = find_user_renewal(db, renewal_time)
    USER_list = list(USER_list)

    ACTION_DAY_CHECK = get_default_day(SJ_USER_ACTION_DAY_CHECK)

    CATEGORY_list = find_all_category_of_topic(db)
    CATEGORY_list = list(CATEGORY_list)

    # Convert to a dict for fast lookups when building the newsfeed interest tags
    CATEGORY = {}
    for cate in CATEGORY_list:
        CATEGORY[cate['category_name']] = cate['tag']

    for USER in USER_list:
        # Skip users with no favorites and no views at all.
        if (len(USER['fav_list']) == 0) and (len(USER['view_list']) == 0):
            continue

        user_log_backup(db, USER)

        fav_tag = []
        view_tag = []
        newsfeed_tag = []
        fav_token = []
        view_token = []
        search_list = []

        # Posts the user favorited ##########################
        fav_topic = np.zeros(LDA.NUM_TOPICS)

        if len(USER['fav_list']) <= SJ_USER_LOG_LIMIT['fav'] * SJ_USER_ACTION_NUM_CHECK_PERCENT:
            for fav in USER['fav_list']:
                fav_topic += fav['topic']
                fav_tag += fav['tag']
                fav_token += fav['token']
        else:
            for fav in USER['fav_list']:
                if fav['date'] < ACTION_DAY_CHECK:
                    continue
                fav_topic += fav['topic']
                fav_tag += fav['tag']
                fav_token += fav['token']

        # Compute FAS (favorites count double)
        fav_doc = (fav_tag + fav_token) * 2

        # Posts the user viewed ##############################
        view_topic = np.zeros(LDA.NUM_TOPICS)

        if len(USER['view_list']) <= SJ_USER_LOG_LIMIT['view'] * SJ_USER_ACTION_NUM_CHECK_PERCENT:
            for view in USER['view_list']:
                view_topic += view['topic']
                view_tag += view['tag']
                view_token += view['token']
        else:
            for view in USER['view_list']:
                if view['date'] < ACTION_DAY_CHECK:
                    continue
                view_topic += view['topic']
                view_tag += view['tag']
                view_token += view['token']

        # Compute FAS
        view_doc = view_tag + view_token

        # Keywords the user searched for ##############################
        if len(USER['search_list']) <= SJ_USER_LOG_LIMIT['search'] * SJ_USER_ACTION_NUM_CHECK_PERCENT:
            for search in USER['search_list']:
                search_list += search['tokenizer_split']
        else:
            for search in USER['search_list']:
                if search['date'] < ACTION_DAY_CHECK:
                    continue
                search_list += search['tokenizer_split']

        search_topic = LDA.get_topics(search_list)
        search_doc = search_list

        # Newsfeeds the user accessed ################################
        A_NUM = 0  # '대학교' (university)
        B_NUM = 0  # '동아리&모임' (clubs & gatherings)
        C_NUM = 0  # '공모전&행사' (contests & events)
        D_NUM = 0  # '진로&구인' (careers & recruiting)
        E_NUM = 0  # free board / community

        if len(USER['newsfeed_list']) <= SJ_USER_LOG_LIMIT['newsfeed'] * SJ_USER_ACTION_NUM_CHECK_PERCENT:
            for newsfeed in USER['newsfeed_list']:
                if newsfeed['newsfeed_name'] == '대학교':
                    A_NUM += 1
                elif newsfeed['newsfeed_name'] == '동아리&모임':
                    B_NUM += 1
                elif newsfeed['newsfeed_name'] == '공모전&행사':
                    C_NUM += 1
                elif newsfeed['newsfeed_name'] == '진로&구인':
                    D_NUM += 1
                else:
                    E_NUM += 1
        else:
            for newsfeed in USER['newsfeed_list']:
                if newsfeed['date'] < ACTION_DAY_CHECK:
                    continue
                if newsfeed['newsfeed_name'] == '대학교':
                    A_NUM += 1
                elif newsfeed['newsfeed_name'] == '동아리&모임':
                    B_NUM += 1
                elif newsfeed['newsfeed_name'] == '공모전&행사':
                    C_NUM += 1
                elif newsfeed['newsfeed_name'] == '진로&구인':
                    D_NUM += 1
                else:
                    E_NUM += 1
                newsfeed_tag += newsfeed['tag']

        newsfeed_tag += (CATEGORY['대학교'] * A_NUM) + (CATEGORY['동아리&모임'] * B_NUM) \
            + (CATEGORY['공모전&행사'] * C_NUM) + (CATEGORY['진로&구인'] * D_NUM) \
            + (CATEGORY['커뮤니티'] * E_NUM)
        newsfeed_topic = LDA.get_topics(newsfeed_tag)

        # Apply weights (repeating a tag list N times makes each tag count
        # N times in the Counter below)
        fav_tag *= SJ_FAV_TAG_WEIGHT
        view_tag *= SJ_VIEW_TAG_WEIGHT
        fav_topic *= SJ_FAV_TOPIC_WEIGHT
        view_topic *= SJ_VIEW_TOPIC_WEIGHT
        search_topic *= SJ_SEARCH_TOPIC_WEIGHT
        newsfeed_topic *= SJ_NEWSFEED_TOPIC_WEIGHT

        if len(USER['fav_list']) != 0:
            fav_topic /= len(USER['fav_list'])
        if len(USER['view_list']) != 0:
            view_topic /= len(USER['view_list'])

        # LDA topic vector
        TOPIC_RESULT = (fav_topic + view_topic + search_topic + newsfeed_topic) / SJ_TOPIC_RESULT_DIV

        # FastText document vector
        FastText_doc = fav_doc + view_doc + search_doc
        if FastText_doc:
            USER_VECTOR = FastText.get_doc_vector(FastText_doc).tolist()
        else:
            USER_VECTOR = np.zeros(FastText.VEC_SIZE).tolist()

        # Tag frequencies
        tag_dict = dict(Counter(fav_tag + view_tag))
        tag_dict = sorted(tag_dict.items(), key=lambda x: x[1])

        # Final tags object: keep at most 50 tags, floor the smallest count at 2,
        # and cap each count at 1.5x the previous one
        TAG_RESULT = {}
        if len(tag_dict) > 0:
            limit = min(len(tag_dict), 50)
            tag_dict[0] = list(tag_dict[0])
            if tag_dict[0][1] == 1:
                tag_dict[0][1] = 2
            TAG_RESULT[tag_dict[0][0]] = tag_dict[0][1]
            for i in range(1, limit):
                tag_dict[i] = list(tag_dict[i])
                if (tag_dict[i - 1][1] * 1.5) < tag_dict[i][1]:
                    tag_dict[i][1] = int(tag_dict[i - 1][1] * 1.5)
                TAG_RESULT[tag_dict[i][0]] = tag_dict[i][1]

        USER_TAG_SUM = sum(TAG_RESULT.values())
        USER_TAG_SUM *= SJ_TAG_SUM_WEIGHT
        if USER_TAG_SUM == 0:
            USER_TAG_SUM = 1

        # Build the user tag vector from the user's tags
        USER_TAGS = []
        for key, value in TAG_RESULT.items():
            USER_TAGS += [key] * value
        TAG_VECTOR = FastText.get_doc_vector(USER_TAGS).tolist()

        # Update this user's interest scores!
        update_user_measurement(
            db, USER['_id'], list(TOPIC_RESULT), TAG_RESULT, USER_TAG_SUM,
            TAG_VECTOR, USER_VECTOR,
            len(USER['fav_list']) + len(USER['view_list']) + len(USER['search_list']))

    update_variable(db, 'renewal', datetime.now())

    if db_client is not None:
        db_client.close()
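# Sketch of the tag-count capping rule used above, isolated for clarity.
# (_cap_tag_counts is a hypothetical helper, not part of the job itself; it
# reproduces the same 2-floor and 1.5x-cap logic over ascending frequencies.)
def _cap_tag_counts(counts, limit=50):
    # counts: {tag: frequency}. Returns at most `limit` tags in ascending
    # frequency order, where each count may exceed the previous by at most 1.5x.
    ranked = sorted(counts.items(), key=lambda x: x[1])[:limit]
    result = {}
    prev = None
    for tag, num in ranked:
        if prev is None:
            num = max(num, 2)      # floor the smallest count at 2
        elif num > prev * 1.5:
            num = int(prev * 1.5)  # cap runaway counts
        result[tag] = num
        prev = num
    return result

# e.g. _cap_tag_counts({'ai': 1, 'java': 3, 'python': 30})
# -> {'ai': 2, 'java': 3, 'python': 4}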
def measurement_run():
    db_client = MongoClient('mongodb://%s:%s@%s' % (MONGODB_ID, MONGODB_PW, MONGODB_HOST))
    db = db_client["soojle"]

    renewal_time = find_variable(db, 'renewal')

    # Only measure users updated since the renewal time
    # (it means an input to the interest measurement has changed!)
    USER_list = find_user_renewal(db, renewal_time)
    USER_list = list(USER_list)

    for USER in USER_list:
        fav_tag = []
        view_tag = []
        newsfeed_tag = []
        fav_token = []
        view_token = []
        search_list = []

        # Posts the user favorited ##########################
        fav_topic = np.zeros(LDA.NUM_TOPICS)
        for fav in USER['fav_list']:
            fav_topic += fav['topic']
            fav_tag += fav['tag']
            fav_token += fav['token']

        # Compute FAS (favorites count double)
        fav_doc = (fav_tag + fav_token) * 2

        # Posts the user viewed ##############################
        view_topic = np.zeros(LDA.NUM_TOPICS)
        for view in USER['view_list']:
            view_topic += view['topic']
            view_tag += view['tag']
            view_token += view['token']

        # Compute FAS
        view_doc = view_tag + view_token

        # Keywords the user searched for ##############################
        for search_obj in USER['search_list'][:SJ_SEARCH_MEASURE_NUM]:
            search_list += search_obj['tokenizer_split']

        search_topic = LDA.get_topics(search_list)
        search_doc = search_list

        # Newsfeeds the user accessed ################################
        for newsfeed in USER['newsfeed_list']:
            newsfeed_tag += newsfeed['tag']

        newsfeed_topic = LDA.get_topics(newsfeed_tag)

        # Apply weights
        fav_tag *= SJ_FAV_TAG_WEIGHT
        view_tag *= SJ_VIEW_TAG_WEIGHT
        fav_topic *= SJ_FAV_TOPIC_WEIGHT
        view_topic *= SJ_VIEW_TOPIC_WEIGHT
        search_topic *= SJ_SEARCH_TOPIC_WEIGHT
        newsfeed_topic *= SJ_NEWSFEED_TOPIC_WEIGHT

        if len(USER['fav_list']) != 0:
            fav_topic /= len(USER['fav_list'])
        if len(USER['view_list']) != 0:
            view_topic /= len(USER['view_list'])

        # LDA topic vector
        TOPIC_RESULT = (fav_topic + view_topic + search_topic + newsfeed_topic) / SJ_TOPIC_RESULT_DIV

        # FastText document vector
        FastText_doc = fav_doc + view_doc + search_doc
        if FastText_doc:
            USER_VECTOR = FastText.get_doc_vector(FastText_doc).tolist()
        else:
            USER_VECTOR = np.zeros(FastText.VEC_SIZE).tolist()

        # Tag frequencies
        tag_dict = dict(Counter(fav_tag + view_tag))
        tag_dict = sorted(tag_dict.items(), key=lambda x: x[1], reverse=True)

        # Keep only the top-ranked tags by frequency.
        TAG_RESULT = {}
        for i in range(min(len(tag_dict), 10)):
            TAG_RESULT[tag_dict[i][0]] = tag_dict[i][1]

        USER_TAG_SUM = sum(TAG_RESULT.values())

        # Scale by 1.5x
        USER_TAG_SUM *= SJ_TAG_SUM_WEIGHT

        # If TAG_SUM is 0, set it to 1.
        if USER_TAG_SUM == 0:
            USER_TAG_SUM = 1

        # Update this user's interest scores! (measurement count +1)
        update_user_measurement(db, USER['_id'], list(TOPIC_RESULT), TAG_RESULT,
                                USER_TAG_SUM, USER_VECTOR,
                                USER['measurement_num'] + 1)

    update_variable(db, 'renewal', datetime.now())

    if db_client is not None:
        db_client.close()
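# Why the integer "*=" tag weighting above works: repeating a Python list N
# times makes each element count N times in the Counter, so list repetition
# acts as frequency weighting. Minimal sketch (hypothetical weights, not the
# real SJ_* constants, which must be integers for this to work):
# from collections import Counter
# fav = ['python', 'ai'] * 3   # favorite tags weighted 3x
# view = ['python']            # viewed tags weighted 1x
# Counter(fav + view)          # -> Counter({'python': 4, 'ai': 3})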
predictions_headlines = simple_text_classification.logistic_regression_classification(
    usa, dutch_news['headlines_en'])
predictions_headlines.columns = [
    'label_id_headline', 'probability_headline', 'label_headline'
]

predictions_content = simple_text_classification.logistic_regression_classification(
    usa, dutch_news['content_en'])
predictions_content.columns = [
    'label_id_content', 'probability_content', 'label_content'
]

dutch_news_topics = pd.DataFrame(
    pd.concat([dutch_news, predictions_headlines, predictions_content],
              axis=1))

# FastText
predictions_headlines_fasttext = FastText.fasttext_classification(
    usa, dutch_news['headlines_en'])
predictions_headlines_fasttext.columns = [
    'f_label_headline', 'f_probability_headline', 'f_label_id_headline'
]

predictions_content_fasttext = FastText.fasttext_classification(
    usa, dutch_news['content_en'])
predictions_content_fasttext.columns = [
    'f_label_content', 'f_probability_content', 'f_label_id_content'
]

dutch_news_topics = pd.DataFrame(
    pd.concat([
        dutch_news_topics, predictions_headlines_fasttext,
        predictions_content_fasttext
    ], axis=1))
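# Quick sanity-check sketch (assumption: the frames above are row-aligned, as
# the axis=1 concat requires; shows how often the logistic-regression and
# FastText labels agree per field):
# print((dutch_news_topics['label_headline']
#        == dutch_news_topics['f_label_headline']).mean())
# print((dutch_news_topics['label_content']
#        == dutch_news_topics['f_label_content']).mean())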
print("设置参数") #获取数据参数 # path = './data/nlpmail_re3.txt' path="./data/nlpmaildatasample2.csv" #数据输入 w2vpath = "./data/w2c_model" #w2v模型地址 embedding_dims = 128 # 词向量长度 logpath='./model/mylog.txt' #日志记录地址 modelpath='./model/' #模型保存目录 #模型训练参数 batch_size = 32 epochs = 100 #fastText参数 ngram_range=2 #TextRCNNmodel参数 hidden_dim_1 = 200 hidden_dim_2 = 100 print("获取数据") x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix=dataPreprocess.getdata(path,embedding_dims,w2vpath) print("调用模型") FastText.getdata_train(path,ngram_range,maxlen+10,max_token,embedding_dims,batch_size,epochs,logpath,modelpath,"FastText") TextCNNmodel.train(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,"TextCNN") TextRNNmodel.train(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,"TextSimpleRNN") TextRNNmodel.train2(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,"TextBiLSTM") TextRNNmodel.train3(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,"TextBiGRU") TextRCNNmodel.train(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,"TextRCNN",hidden_dim_1,hidden_dim_2) TextAttention.train(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,"TextAttention") MyModel.train(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,"MyConBiGRU")