def is_English(text=None):
    if text is not None:
        try:
            text.encode('ascii')
        except UnicodeEncodeError:
            # print("it was not an ascii-encoded unicode string")
            return False
        else:
            # print("it may have been an ascii-encoded unicode string")
            return True
    else:
        # print('The input string is None.')
        return


def CNN_VA_prediction(text):
    # Dispatch to the English or Chinese CNN model depending on the input language.
    # cnn and cnn_Chinese are project functions assumed to be defined/imported elsewhere.
    if is_English(text):
        return cnn(text)
    else:
        return cnn_Chinese(text)


if __name__ == "__main__":
    from load_data import load_corpus

    data = load_corpus('../data/corpus/cn/corpus_raw/')
    for i in data:
        print(''.join(i))
        print(CNN_VA_prediction(''.join(i)))
    exit()
    text = '我今天特別高興'
    print(CNN_VA_prediction(text))
    text = 'appy B-day Jim Price!! :-) (you are more awesome than you could dream) Hope today was the best ever!! :-D'
    print(CNN_VA_prediction(text))
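# A hypothetical usage sketch of the ASCII heuristic above (not part of the original file):
# str.encode('ascii') raises UnicodeEncodeError for any non-ASCII character, so a pure-English
# string returns True, a string containing Chinese characters returns False, and None falls
# through the bare return and yields None.
#
#     assert is_English('Hope today was the best ever!!') is True
#     assert is_English('我今天特別高興') is False
#     assert is_English(None) is None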
from collections import defaultdict

import numpy as np

# load_corpus, get_file_path, dump_picle, load_pickle, build_embedding_matrix and
# load_embeddings are project helpers; their imports are not shown in this fragment.


def get_vocab(corpus):
    vocab = defaultdict(float)
    for sent in corpus:
        for word in sent:
            vocab[word] += 1
    print(len(vocab))
    return vocab

# Note: this file builds the CNN input data for the CVAT corpus.
########################################## config ########################################
vec_dim = 400
##########################################################################################

corpus = load_corpus(get_file_path('cn_corpus'))
print(corpus[:2])

vocab = get_vocab(corpus)
dump_picle(vocab, get_file_path('CVAT_Vocab'))
print('Dump CVAT vocab OK')
# vocab = load_pickle(get_file_path('CVAT_Vocab'))
for i in vocab:
    print(i)
print(len(vocab))

W, word_idx_map = build_embedding_matrix(load_embeddings('zh_tw'), vocab, k=400)
dump_picle(word_idx_map, get_file_path('word_idx_map_CVAT'))
print('dump word_idx_map successful')
dump_picle(W, './data/tmp/embedding_matrix_CVAT.p')
print('OK')
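# build_embedding_matrix is not shown in this fragment. Below is a minimal sketch of what
# such a helper conventionally does, assuming load_embeddings returns a dict-like mapping
# word -> pre-trained vector; the name, row-0 padding convention and random init range are
# assumptions for illustration, not the project's actual implementation.
def build_embedding_matrix_sketch(embeddings, vocab, k=400):
    # Row 0 is reserved for padding; known words copy their pre-trained vector,
    # out-of-vocabulary words get a small uniform random vector.
    word_idx_map = {}
    W = np.zeros((len(vocab) + 1, k), dtype='float32')
    for idx, word in enumerate(vocab, start=1):
        word_idx_map[word] = idx
        if word in embeddings:
            W[idx] = embeddings[word]
        else:
            W[idx] = np.random.uniform(-0.25, 0.25, k)
    return W, word_idx_map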
def cv(data, target, multivariant=False):
    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(data, target, test_size=0.1,
                                                                         random_state=10)
    if multivariant is False:
        linear_regression(X_train, X_test, Y_train, Y_test, plot=False)
    else:
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='ordinary_least_squares')
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='Ridge_Regression')
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='Bayesian_Regression')
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='SVR')
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='KNN_Reg')


if __name__ == '__main__':
    normalize = True
    corpus = load_corpus(get_file_path('cn_corpus'))
    # lexicon = load_lexicon(get_file_path('lexicon'))
    mark = load_mark(get_file_path('mark'))
    lexicon = combine_lexicon(get_file_path('lexicon'), get_file_path('neural_cand'))

    # # the following can be used to check which words the corpus and lexicon share
    # from visualization import show_common_term
    # show_common_term(corpus, lexicon)
    # exit()

    valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion(corpus, lexicon, mark)
    print('start.....')
    cv(valence_mean, valence_true, multivariant=False)
    cv(arousal_mean, arousal_true, multivariant=False)
    print('OK')
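# Note on the cv() helper above: sklearn.cross_validation was deprecated in scikit-learn 0.18
# and removed in 0.20. On a current install the equivalent split (same behaviour) would be:
#
#     from sklearn.model_selection import train_test_split
#     X_train, X_test, Y_train, Y_test = train_test_split(data, target, test_size=0.1, random_state=10)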
logging.root.setLevel(level=logging.INFO)
logger.info(r"running %s" % ''.join(sys.argv))

corpus_name = get_file_path('cn_corpus')
logger.info(r"loading corpus from : " + corpus_name)
lexicon_name = get_file_path('lexicon')
logger.info(r"loading lexicon from : " + lexicon_name)
expand_name = get_file_path('neural_cand')
logger.info(r"loading expand_word from : " + expand_name)
mark_name = get_file_path('mark')
logger.info(r"loading mark from : " + mark_name)

corpus = load_corpus(corpus_name)
lexicon = load_lexicon(lexicon_name)
mark = load_mark(mark_name)

# log_state('use extend lexicon')
lexicon = combine_lexicon(lexicon_name, expand_name)

log_state('mean')
evaluate_mean(corpus, lexicon, mark)
log_state('tf_mean')
evaluate_tf_mean(corpus, lexicon, mark)
log_state('tfidf_mean')
evaluate_tfidf_mean(corpus, lexicon, mark)
log_state('geo')
evaluate_geo(corpus, lexicon, mark)
log_state('tfidf_geo')
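# evaluate_mean and the other evaluate_* calls above are project helpers not shown here.
# A minimal sketch, assuming the 'mean' strategy simply averages the lexicon valence of a
# document's words (an assumption about the method, not the project's actual implementation):
def mean_valence_sketch(doc_words, lexicon):
    # lexicon is assumed to map word -> valence score; out-of-lexicon words are skipped
    scores = [lexicon[w] for w in doc_words if w in lexicon]
    return sum(scores) / len(scores) if scores else None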
# This builds the input data for the model from the YouTube 2016 paper; I also ran it once,
# but the results were very poor...
import tensorflow as tf
import numpy as np
import pandas as pd
import random
from functiontool.getrelation import getrelation
from functiontool.dirkit import getdir
from functiontool.baseinfo import getlive
from sklearn import metrics
import math
from load_data import load_corpus, index_item, get_live_vec

sentences, words, validatewords, user = load_corpus('corpus.txt', usertxt='./sample/dir.txt')
assert len(sentences) == len(user)
index2item, item2index, vocabulary = index_item(words)
user_num = len(sentences)
item_num = len(words)
vocabulary_size = len(vocabulary)
print(vocabulary_size)
# sentences: sequences whose elements are the raw live ids
# words: list of lives sorted by popularity, carrying their frequencies, so a list of tuples
# validatewords: the last few items are held out and used as the validation set
# user: list of user ids aligned with the sentences index
# vocabulary: list sorted by frequency
# All index information has been obtained; the dataset indices are loaded but not yet split.
print('All content loaded; now loading the data')
# Load the basic profile information below
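# index_item is a project helper not shown in this fragment; a minimal sketch of what the
# mapping step presumably looks like, given the comment that words is a popularity-sorted
# list of (live_id, count) tuples. The name and return shapes here are assumptions.
def index_item_sketch(words):
    index2item = [live_id for live_id, _ in words]                      # position -> raw live id
    item2index = {live_id: i for i, live_id in enumerate(index2item)}   # raw live id -> position
    vocabulary = list(words)                                            # frequency-sorted vocabulary
    return index2item, item2index, vocabulary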
# plot hourly distribution for different parts of the week
hist_comb = hist_hourly_weekday_weekend(df)
plot_custom(hist_comb, 'line',
            title='Hourly Submissions Over the Weekdays',
            file='hist_hourly_weekday_weekend',
            xlabel='Hour', ylabel='Count',
            colors=['tab:red', 'tab:green'],
            show=False)

# word corpus analysis
df_collection = list()

# individual corpus analysis plot
stress_words = load_corpus('data/stress_corpus.txt')
df_result = keywords_over_quarters(
    stress_words, calendar, df,
    title='Stress & Depression over typical Quarter',
    file='plot_stress',
    colors=['lightgrey', 'tab:red', 'tab:green'],
    show=False)
df_collection.append(('Stress/Depression', df_result))

thirst_words = load_corpus('data/thirst_corpus.txt')
df_result = keywords_over_quarters(
    thirst_words, calendar, df,
def __init__(self, file_dir):
    self.file_dir = file_dir
    self.corpus = load_corpus(file_dir)
def cv(data, target, multivariant=False):
    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(data, target, test_size=0.1,
                                                                         random_state=10)
    if multivariant is False:
        linear_regression(X_train, X_test, Y_train, Y_test, plot=False)
    else:
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun="ordinary_least_squares")
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun="Ridge_Regression")
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun="Bayesian_Regression")
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun="SVR")
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun="KNN_Reg")


if __name__ == "__main__":
    normalize = True
    corpus = load_corpus(get_file_path("cn_corpus"))
    # lexicon = load_lexicon(get_file_path('lexicon'))
    mark = load_mark(get_file_path("mark"))
    lexicon = combine_lexicon(get_file_path("lexicon"), get_file_path("neural_cand"))

    # # the following can be used to check which words the corpus and lexicon share
    # from visualization import show_common_term
    # show_common_term(corpus, lexicon)
    # exit()

    valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion(corpus, lexicon, mark)
    print("start.....")
    cv(valence_mean, valence_true, multivariant=False)
    cv(arousal_mean, arousal_true, multivariant=False)
    print("OK")
# Normalize the basic profile information, then save it to csv.
from load_data import load_corpus, index_item
import pandas as pd
import numpy as np
from functiontool.baseinfo import getlive

sentences, words, validatewords, user = load_corpus('corpus_all.txt', usertxt='dir.txt')
assert len(sentences) == len(user)
index2item, item2index, vocabulary = index_item(words)
vocabulary_size = len(vocabulary)
future_batch = 5
print(vocabulary_size)
print(user[:30])
# sentences: sequences whose elements are the raw live ids
# words: list of lives sorted by popularity, carrying their frequencies, so a list of tuples
# validatewords: the last few items are held out and used as the validation set
# user: list of user ids aligned with the sentences index
# vocabulary: list sorted by frequency
# All index information has been obtained; the dataset indices are loaded but not yet split.
print('All content loaded; now loading the data')
# Load the basic profile information below


def get_user_base(user):
    from sqlalchemy import create_engine
    base_engine = create_engine("mysql+pymysql://root:[email protected]:3306/zhihu", max_overflow=5)
    print('Start reading the 400k users')
    df = pd.read_sql('user', base_engine)
    df.set_index(["id"], inplace=True)
    print('Reading finished')
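    # The fragment ends here, so the rest of get_user_base is not shown. A hypothetical
    # sketch of the step the header comment describes (normalize, then save to csv),
    # assuming the numeric profile columns are z-scored; names and file path are illustrative only:
    #
    #     numeric_cols = df.select_dtypes(include=[np.number]).columns
    #     df[numeric_cols] = (df[numeric_cols] - df[numeric_cols].mean()) / df[numeric_cols].std()
    #     df.to_csv('user_base_normalized.csv')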