Example #1
def create_category_of_topic_tag_vector(db):
    # fetch every category_of_topic document
    category_of_topic_list = db[SJ_DB_CATEGORY].find(
        {'category_name': {
            '$in': list(SJ_CATEGORY_OF_TOPIC_SET)
        }}, {'_id': 0})
    category_of_topic_list = list(category_of_topic_list)

    for category in category_of_topic_list:
        category_tag_vector = FastText.get_doc_vector(category['tag']).tolist()

        # store the tag_vector on the category_of_topic collection
        db[SJ_DB_CATEGORY].update_one(
            {'category_name': category['category_name']},
            {'$set': {
                'tag_vector': category_tag_vector
            }})
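
A minimal usage sketch for the function above; the connection string here is a placeholder, and the 'soojle' database name is borrowed from Examples #3 and #4:

# Hypothetical driver code, not part of the original example
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017')
db = client['soojle']
create_category_of_topic_tag_vector(db)
client.close()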
Example #2
import simple_text_classification
import preprocessing
import FastText
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# read the train data
usa = preprocessing.train_data(23)
usa.to_csv("usa.csv", index=False)
usa['category_id'] = usa['tag'].factorize()[0]

lr_pred = simple_text_classification.cross_validation_logistic_regression(usa)
lr_pred.columns = ['label_id_lr', 'proba_lr', 'label_lr']

ft_pred = FastText.cross_validation_fasttext(usa)
ft_pred.columns = ['label_ft', 'proba_ft', 'label_id_ft']

# ensemble stacking method
features = pd.concat([lr_pred['label_id_lr'], ft_pred['label_id_ft']], axis=1)
labels = usa.category_id
X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.30,
                                                    random_state=0)
model = LogisticRegression(random_state=1)
model.fit(X_train, y_train)
model.score(X_test, y_test)

# pick the class with the highest probability
# (assumed completion; the original snippet is cut off after pd.concat)
predictions = pd.concat([lr_pred, ft_pred], axis=1)
predictions['label'] = np.where(predictions['proba_lr'] >= predictions['proba_ft'],
                                predictions['label_lr'], predictions['label_ft'])
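
This is a standard two-level stack: the base learners' predictions become the features of a logistic-regression meta-model. Stacking on predicted label ids, as above, discards confidence information; a common variant (not in the original snippet) feeds the probabilities in as well:

# Variant meta-features: the base models' confidences
features_proba = pd.concat([lr_pred['proba_lr'], ft_pred['proba_ft']], axis=1)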
Example #3
def SJ_interest_measurement_run():
    db_client = MongoClient('mongodb://%s:%s@%s' %
                            (MONGODB_ID, MONGODB_PW, MONGODB_HOST))
    db = db_client["soojle"]

    renewal_time = find_variable(db, 'renewal')

    USER_list = find_user_renewal(db, renewal_time)
    USER_list = list(USER_list)

    ACTION_DAY_CHECK = get_default_day(SJ_USER_ACTION_DAY_CHECK)

    CATEGORY_list = find_all_category_of_topic(db)
    CATEGORY_list = list(CATEGORY_list)
    CATEGORY = {}
    # convert to a dict for fast lookup when computing the newsfeed interest tags
    for cate in CATEGORY_list:
        CATEGORY[cate['category_name']] = cate['tag']

    for USER in USER_list:
        # skip users with no favorites and no views
        if (len(USER['fav_list']) == 0) and (len(USER['view_list']) == 0):
            continue

        user_log_backup(db, USER)

        fav_tag = []
        view_tag = []
        newsfeed_tag = []
        fav_token = []
        view_token = []
        search_list = []

        # posts the user has marked as favorites ##########################
        fav_topic = np.zeros(LDA.NUM_TOPICS)
        if len(USER['fav_list']) <= SJ_USER_LOG_LIMIT['fav'] * SJ_USER_ACTION_NUM_CHECK_PERCENT:
            for fav in USER['fav_list']:
                fav_topic += fav['topic']
                fav_tag += fav['tag']
                fav_token += fav['token']
        else:
            for fav in USER['fav_list']:
                if fav['date'] < ACTION_DAY_CHECK: continue
                fav_topic += fav['topic']
                fav_tag += fav['tag']
                fav_token += fav['token']

        # build the FAS (FastText input document); favorites count double
        fav_doc = (fav_tag + fav_token) * 2

        # posts the user has viewed ##############################
        view_topic = np.zeros(LDA.NUM_TOPICS)
        if len(USER['view_list']) <= SJ_USER_LOG_LIMIT['view'] * SJ_USER_ACTION_NUM_CHECK_PERCENT:
            for view in USER['view_list']:
                view_topic += view['topic']
                view_tag += view['tag']
                view_token += view['token']
        else:
            for view in USER['view_list']:
                if view['date'] < ACTION_DAY_CHECK: continue
                view_topic += view['topic']
                view_tag += view['tag']
                view_token += view['token']

        # build the FAS (FastText input document)
        view_doc = view_tag + view_token

        # keywords the user has searched for ##############################
        if len(USER['search_list']) <= SJ_USER_LOG_LIMIT['search'] * SJ_USER_ACTION_NUM_CHECK_PERCENT:
            for search in USER['search_list']:
                search_list += search['tokenizer_split']
        else:
            for search in USER['search_list']:
                if search['date'] < ACTION_DAY_CHECK: continue
                search_list += search['tokenizer_split']

        search_topic = LDA.get_topics(search_list)
        search_doc = search_list

        # newsfeeds the user has accessed ################################
        A_NUM = 0  # university
        B_NUM = 0  # clubs & meetups
        C_NUM = 0  # contests & events
        D_NUM = 0  # careers & recruiting
        E_NUM = 0  # free board

        if len(USER['newsfeed_list']) <= SJ_USER_LOG_LIMIT['newsfeed'] * SJ_USER_ACTION_NUM_CHECK_PERCENT:
            for newsfeed in USER['newsfeed_list']:
                if newsfeed['newsfeed_name'] == '대학교': A_NUM += 1
                elif newsfeed['newsfeed_name'] == '동아리&모임': B_NUM += 1
                elif newsfeed['newsfeed_name'] == '공모전&행사': C_NUM += 1
                elif newsfeed['newsfeed_name'] == '진로&구인': D_NUM += 1
                else: E_NUM += 1
        else:
            for newsfeed in USER['newsfeed_list']:
                if newsfeed['date'] < ACTION_DAY_CHECK: continue

                if newsfeed['newsfeed_name'] == '대학교': A_NUM += 1
                elif newsfeed['newsfeed_name'] == '동아리&모임': B_NUM += 1
                elif newsfeed['newsfeed_name'] == '공모전&행사': C_NUM += 1
                elif newsfeed['newsfeed_name'] == '진로&구인': D_NUM += 1
                else: E_NUM += 1

                newsfeed_tag += newsfeed['tag']

        newsfeed_tag += (CATEGORY['대학교'] * A_NUM) + (
            CATEGORY['동아리&모임'] * B_NUM) + (CATEGORY['공모전&행사'] * C_NUM) + (
                CATEGORY['진로&구인'] * D_NUM) + (CATEGORY['커뮤니티'] * E_NUM)

        newsfeed_topic = LDA.get_topics(newsfeed_tag)

        # apply weights
        fav_tag *= SJ_FAV_TAG_WEIGHT
        view_tag *= SJ_VIEW_TAG_WEIGHT

        fav_topic *= SJ_FAV_TOPIC_WEIGHT
        view_topic *= SJ_VIEW_TOPIC_WEIGHT
        search_topic *= SJ_SEARCH_TOPIC_WEIGHT
        newsfeed_topic *= SJ_NEWSFEED_TOPIC_WEIGHT

        if len(USER['fav_list']) != 0:
            fav_topic /= len(USER['fav_list'])

        if len(USER['view_list']) != 0:
            view_topic /= len(USER['view_list'])

        #LDA Topic
        TOPIC_RESULT = (fav_topic + view_topic + search_topic +
                        newsfeed_topic) / SJ_TOPIC_RESULT_DIV

        #FASTTEXT
        FastText_doc = fav_doc + view_doc + search_doc

        if FastText_doc:
            USER_VECTOR = FastText.get_doc_vector(FastText_doc).tolist()
        else:
            USER_VECTOR = np.zeros(FastText.VEC_SIZE).tolist()

        #TAG
        tag_counts = sorted(Counter(fav_tag + view_tag).items(), key=lambda x: x[1])
        # Counter.items() yields immutable tuples; convert so the counts can be adjusted
        tag_counts = [list(item) for item in tag_counts]

        # final tag object (at most 50 tags)
        TAG_RESULT = {}

        if tag_counts:
            if tag_counts[0][1] == 1:
                tag_counts[0][1] = 2

            TAG_RESULT[tag_counts[0][0]] = tag_counts[0][1]

            # cap each count at 1.5x the previous one to damp runaway frequencies
            for i in range(1, min(len(tag_counts), 50)):
                if (tag_counts[i - 1][1] * 1.5) < tag_counts[i][1]:
                    tag_counts[i][1] = int(tag_counts[i - 1][1] * 1.5)

                TAG_RESULT[tag_counts[i][0]] = tag_counts[i][1]

        USER_TAG_SUM = sum(TAG_RESULT.values())

        USER_TAG_SUM *= SJ_TAG_SUM_WEIGHT

        if USER_TAG_SUM == 0:
            USER_TAG_SUM = 1

        # build the user's tag vector from their tags
        USER_TAGS = []
        for key, value in TAG_RESULT.items():
            USER_TAGS += [key] * value
        TAG_VECTOR = FastText.get_doc_vector(USER_TAGS).tolist()

        # refresh this user's interest profile
        update_user_measurement(
            db, USER['_id'], list(TOPIC_RESULT), TAG_RESULT, USER_TAG_SUM,
            TAG_VECTOR, USER_VECTOR,
            len(USER['fav_list']) + len(USER['view_list']) +
            len(USER['search_list']))

    update_variable(db, 'renewal', datetime.now())

    if db_client is not None:
        db_client.close()
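
The TAG_RESULT construction is the subtle part of this example: tags from favorites and views are counted, sorted by frequency, and each count is capped at 1.5 times its predecessor so that a few dominant tags cannot swamp the profile. The same idea as a self-contained sketch (function name and defaults are illustrative):

from collections import Counter

def cap_tag_counts(tags, limit=50, growth=1.5):
    # sort ascending by frequency, as Example #3 does, and keep at most `limit` tags
    counts = sorted(Counter(tags).items(), key=lambda x: x[1])[:limit]
    result = {}
    prev = None
    for tag, n in counts:
        if prev is None:
            n = max(n, 2)           # a single occurrence still counts as 2
        elif n > prev * growth:
            n = int(prev * growth)  # cap runaway counts
        result[tag] = n
        prev = n
    return result

# cap_tag_counts(['ai'] * 10 + ['nlp'] * 2 + ['db'])  ->  {'db': 2, 'nlp': 2, 'ai': 3}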
Example #4
def measurement_run():
	db_client = MongoClient('mongodb://%s:%s@%s' %(MONGODB_ID, MONGODB_PW, MONGODB_HOST))
	db = db_client["soojle"]

	renewal_time = find_variable(db, 'renewal')

	# only measure users updated since the last renewal time (their interest signals have changed)
	USER_list = find_user_renewal(db, renewal_time)
	USER_list = list(USER_list)

	for USER in USER_list:
		fav_tag = []
		view_tag = []
		newsfeed_tag = []
		fav_token = []
		view_token = []
		search_list = []

		# posts the user has marked as favorites ##########################
		fav_topic = np.zeros(LDA.NUM_TOPICS)
		for fav in USER['fav_list']:
			fav_topic += fav['topic']
			fav_tag += fav['tag']
			fav_token += fav['token']

		# build the FAS (FastText input document); favorites count double
		fav_doc = (fav_tag + fav_token) * 2

		# posts the user has viewed ##############################
		view_topic = np.zeros(LDA.NUM_TOPICS)
		for view in USER['view_list']:
			view_topic += view['topic']
			view_tag += view['tag']
			view_token += view['token']

		# build the FAS (FastText input document)
		view_doc = view_tag + view_token

		# keywords the user has searched for ##############################
		for search_obj in USER['search_list'][:SJ_SEARCH_MEASURE_NUM]:
			search_list += search_obj['tokenizer_split']
		
		search_topic = LDA.get_topics(search_list)
		search_doc = search_list

		# newsfeeds the user has accessed ################################
		for newsfeed in USER['newsfeed_list']:
			newsfeed_tag += newsfeed['tag']

		newsfeed_topic = LDA.get_topics(newsfeed_tag)


		# apply weights
		fav_tag *= SJ_FAV_TAG_WEIGHT
		view_tag *= SJ_VIEW_TAG_WEIGHT
		
		fav_topic *= SJ_FAV_TOPIC_WEIGHT
		view_topic *= SJ_VIEW_TOPIC_WEIGHT
		search_topic *= SJ_SEARCH_TOPIC_WEIGHT
		newsfeed_topic *= SJ_NEWSFEED_TOPIC_WEIGHT

		if len(USER['fav_list']) != 0:
			fav_topic /= len(USER['fav_list'])
		
		if len(USER['view_list']) != 0:
			view_topic /= len(USER['view_list'])

		#LDA Topic
		TOPIC_RESULT = (fav_topic + view_topic + search_topic + newsfeed_topic)/SJ_TOPIC_RESULT_DIV

		#FASTTEXT
		FastText_doc = fav_doc + view_doc + search_doc

		if FastText_doc:
			USER_VECTOR = FastText.get_doc_vector(FastText_doc).tolist()
		else:
			USER_VECTOR = np.zeros(FastText.VEC_SIZE).tolist()
			
		#TAG
		tag_dict = dict(Counter(fav_tag + view_tag))
		tag_dict = sorted(tag_dict.items(), key=lambda x: x[1], reverse=True)
		
		# keep only the top-10 tags by frequency
		TAG_RESULT = {}

		for i in range(min(len(tag_dict), 10)):
			TAG_RESULT[tag_dict[i][0]] = tag_dict[i][1]
					
		USER_TAG_SUM = sum(TAG_RESULT.values())

		# scale the tag sum up 1.5x
		USER_TAG_SUM *= SJ_TAG_SUM_WEIGHT

		# if the tag sum is 0, set it to 1
		if USER_TAG_SUM == 0:
			USER_TAG_SUM = 1

		# refresh this user's interest profile (increment the measurement count)
		update_user_measurement(db, USER['_id'], list(TOPIC_RESULT), TAG_RESULT, USER_TAG_SUM, USER_VECTOR, USER['measurement_num'] + 1)

	update_variable(db, 'renewal', datetime.now())

	if db_client is not None:
		db_client.close()
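
Examples #3 and #4 combine the per-source topic distributions the same way: each source's LDA vector is weighted, the favorite and view vectors are averaged over their list lengths, and the weighted sum is divided by SJ_TOPIC_RESULT_DIV. The core combination, stripped of the database plumbing (the weight values below are placeholders for the SJ_* constants):

import numpy as np

def combine_topics(fav, view, search, newsfeed, weights=(1.0, 1.0, 1.0, 1.0), div=4.0):
    # weighted sum of the four per-source LDA topic vectors, then a fixed divisor
    vecs = [np.asarray(v, dtype=float) * w
            for v, w in zip((fav, view, search, newsfeed), weights)]
    return sum(vecs) / div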
Example #5
predictions_headlines = simple_text_classification.logistic_regression_classification(
    usa, dutch_news['headlines_en'])
predictions_headlines.columns = [
    'label_id_headline', 'probability_headline', 'label_headline'
]
predictions_content = simple_text_classification.logistic_regression_classification(
    usa, dutch_news['content_en'])
predictions_content.columns = [
    'label_id_content', 'probability_content', 'label_content'
]
dutch_news_topics = pd.DataFrame(
    pd.concat([dutch_news, predictions_headlines, predictions_content],
              axis=1))

# FastText
predictions_headlines_fasttext = FastText.fasttext_classification(
    usa, dutch_news['headlines_en'])
predictions_headlines_fasttext.columns = [
    'f_label_headline', 'f_probability_headline', 'f_label_id_headline'
]
predictions_content_fasttext = FastText.fasttext_classification(
    usa, dutch_news['content_en'])
predictions_content_fasttext.columns = [
    'f_label_content', 'f_probability_content', 'f_label_id_content'
]
dutch_news_topics = pd.DataFrame(
    pd.concat([
        dutch_news_topics, predictions_headlines_fasttext,
        predictions_content_fasttext
    ],
              axis=1))
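
With both classifiers' predictions side by side in dutch_news_topics, a natural follow-up (an assumption, not shown in the original) is to keep, per row, whichever model is more confident about the headline label:

import numpy as np

use_ft = (dutch_news_topics['f_probability_headline'] > dutch_news_topics['probability_headline'])
dutch_news_topics['label_headline_best'] = np.where(use_ft, dutch_news_topics['f_label_headline'], dutch_news_topics['label_headline'])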
print("设置参数")
# data parameters
# path = './data/nlpmail_re3.txt'
path = "./data/nlpmaildatasample2.csv"  # input data
w2vpath = "./data/w2c_model"  # word2vec model path
embedding_dims = 128  # word-vector dimension
logpath = './model/mylog.txt'  # log file path
modelpath = './model/'  # model save directory
# model training parameters
batch_size = 32
epochs = 100
# fastText parameters
ngram_range = 2
# TextRCNN model parameters
hidden_dim_1 = 200
hidden_dim_2 = 100


print("获取数据")
x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix=dataPreprocess.getdata(path,embedding_dims,w2vpath)

print("调用模型")
FastText.getdata_train(path,ngram_range,maxlen+10,max_token,embedding_dims,batch_size,epochs,logpath,modelpath,"FastText")
TextCNNmodel.train(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,"TextCNN")
TextRNNmodel.train(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,"TextSimpleRNN")
TextRNNmodel.train2(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,"TextBiLSTM")
TextRNNmodel.train3(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,"TextBiGRU")
TextRCNNmodel.train(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,"TextRCNN",hidden_dim_1,hidden_dim_2)
TextAttention.train(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,"TextAttention")
MyModel.train(x_train, y_train, x_test, y_test,maxlen,max_token,embedding_matrix,embedding_dims,batch_size,epochs,logpath,modelpath,"MyConBiGRU")
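
The fastText-style model above depends on ngram_range = 2, i.e. bigram indices appended to each token sequence before training. A minimal sketch of that augmentation (names are illustrative; the real logic lives inside FastText.getdata_train):

def add_bigrams(seq, token_index, max_token):
    # append one new index per adjacent token pair, fastText-style
    out = list(seq)
    for pair in zip(seq, seq[1:]):
        if pair not in token_index:
            token_index[pair] = max_token + len(token_index) + 1
        out.append(token_index[pair])
    return out

# add_bigrams([3, 7, 2], {}, max_token=100)  ->  [3, 7, 2, 101, 102]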